Spaces:

Md-Hakim
/

text-summarization

Sleeping

App Files Files Community

hakim committed on Aug 11, 2024

Commit

fd31bf7

1 Parent(s): f2492e6

model evaluation added

Browse files

Files changed (15) hide show

.github/workflows/main.yaml +20 -0
Dockerfile +11 -0
README.md +61 -1
app.py +73 -0
config/config.yaml +10 -1
main.py +14 -1
requirements.txt +1 -1
research/model_evaluatoin.ipynb +300 -0
research/model_trainer.ipynb +212 -0
src/textsummarizer/config/configuration.py +21 -1
src/textsummarizer/conponents/model_evaluation.py +90 -0
src/textsummarizer/entity/config_entity.py +10 -0
src/textsummarizer/pipeline/predict.py +24 -0
src/textsummarizer/pipeline/stage_05_model_evaluation.py +13 -0
src/textsummarizer/utils/common.py +36 -1

.github/workflows/main.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+# Pushes the main branch of this repository to the Md-Hakim/text-summarization
+# Space on Hugging Face whenever main is updated (or when triggered manually).
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          # 0 = fetch the complete history rather than a shallow clone
+          fetch-depth: 0
+      - name: Push to hub
+        env:
+          # Access token read from the repository's Actions secrets
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # --force overwrites the Space's main branch with this repository's main
+        run: git push --force https://Md-Hakim:$HF_TOKEN@huggingface.co/spaces/Md-Hakim/text-summarization main

Dockerfile CHANGED Viewed

	@@ -0,0 +1,11 @@

+# Image that installs the project requirements and starts the Streamlit app.
+FROM python:3.11-slim
+WORKDIR /code
+# Requirements are copied and installed before the rest of the source so the
+# dependency layer can be reused when only application code changes.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+COPY . /code
+# NOTE(review): no port/address flags are passed, so Streamlit runs with its
+# defaults - confirm that matches the deployment target.
+CMD ["streamlit", "run", "app.py"]

README.md CHANGED Viewed

	@@ -1 +1,61 @@
1	- ~~# text-summarization~~

+---
+title: Text Summarization
+emoji: 🐨
+colorFrom: blue
+colorTo: gray
+sdk: streamlit
+sdk_version: 1.37.1
+app_file: app.py
+pinned: false
+license: mit
+---
+# text-summarization
+## Workflows
+1. Update config.yaml
+2. Update secrets.yaml [Optional]
+3. Update params.yaml
+4. Update the entity
+5. Update the configuration manager in src config
+6. Update the components
+7. Update the pipeline
+8. Update the main.py
+9. Update the dvc.yaml
+10. app.py
+# How to run?
+### STEPS:
+Clone the repository
+```bash
+git clone https://github.com/HAKIM-ML/text-summarization
+```
+### STEP 01- Create a conda environment after opening the repository
+```bash
+conda create -n cnncls python=3.8 -y
+```
+```bash
+conda activate cnncls
+```
+### STEP 02- install the requirements
+```bash
+pip install -r requirements.txt
+```
+```bash
+# Finally run the following command
+streamlit run app.py
+```
+Now open up your local host and port in a browser.

app.py CHANGED Viewed

	@@ -0,0 +1,73 @@

+import streamlit as st
+from textsummarizer.pipeline.predict import PredictionPipeline
+def main():
+    # Set page config
+    st.set_page_config(page_title="Dialogue Summarizer", page_icon="💬", layout="wide")
+    # Custom CSS to improve the appearance
+    st.markdown("""
+    <style>
+    .big-font {
+        font-size:20px !important;
+        font-weight: bold;
+    }
+    .result-font {
+        font-size:18px !important;
+        font-style: italic;
+    }
+    .stButton>button {
+        width: 100%;
+        height: 50px;
+        font-size: 20px;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+    # App title and description
+    st.title("🤖 AI Dialogue Summarizer")
+    st.markdown("Transform your lengthy conversations into concise summaries with our cutting-edge AI technology.")
+    # Create two columns
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        st.markdown('<p class="big-font">Input Dialogue</p>', unsafe_allow_html=True)
+        user_input = st.text_area("", height=300, placeholder="Paste your dialogue here...")
+    with col2:
+        st.markdown('<p class="big-font">Summary</p>', unsafe_allow_html=True)
+        summary_placeholder = st.empty()
+    # Create an instance of PredictionPipeline
+    predictor = PredictionPipeline()
+    if st.button("📝 Generate Summary"):
+        if user_input:
+            with st.spinner('Generating summary...'):
+                # Get the summary
+                summary = predictor.predict(user_input)
+                # Display the summary
+                summary_placeholder.markdown(f'<p class="result-font">{summary}</p>', unsafe_allow_html=True)
+        else:
+            st.warning("⚠️ Please enter some text to summarize.")
+    # Add some spacing
+    st.markdown("<br><br>", unsafe_allow_html=True)
+    # Add a section for app info
+    st.markdown("## About This App")
+    st.info("""
+    This AI-powered dialogue summarizer uses advanced natural language processing to distill the key points from conversations.
+    It's perfect for quickly understanding the essence of meetings, chats, or any form of dialogue.
+    **How to use:**
+    1. Paste your dialogue in the text area on the left.
+    2. Click the 'Generate Summary' button.
+    3. View the AI-generated summary on the right.
+    For best results, ensure your input is a clear dialogue or conversation.
+    """)
+if __name__ == "__main__":
+    main()

config/config.yaml CHANGED Viewed

@@ -24,4 +24,13 @@ data_transformation:
 model_trainer:
   root_dir: artifacts/model_trainer
   data_path: artifacts/data_transformation/samsum_dataset
-  model_ckpt: google/pegasus-cnn_dailymail

 model_trainer:
   root_dir: artifacts/model_trainer
   data_path: artifacts/data_transformation/samsum_dataset
+  model_ckpt: google/pegasus-cnn_dailymail
+model_evaluation:
+  root_dir: artifacts/model_evaluation
+  data_path: artifacts/data_transformation/samsum_dataset
+  model_path: artifacts/model_trainer/pegasus-samsum-model
+  tokenizer_path: artifacts/model_trainer/tokenizer
+  metric_file_name: artifacts/model_evaluation/metrics.json

main.py CHANGED Viewed

@@ -2,6 +2,7 @@ from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipelin
 from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
 from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline
 from textsummarizer.pipeline.stage_04_model_trainer import ModelTrainerPipeline
 from textsummarizer.logging import logger
 STAGE_NAME = "Data Ingestion stage"
@@ -38,12 +39,24 @@ except Exception as e:
-STAGE_NAME = "Data Traniner stage"
 try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    model_tranier = ModelTrainerPipeline()
    model_tranier.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
         logger.exception(e)
         raise e

 from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
 from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline
 from textsummarizer.pipeline.stage_04_model_trainer import ModelTrainerPipeline
+from textsummarizer.pipeline.stage_05_model_evaluation import ModelEvaluationPipeline
 from textsummarizer.logging import logger
 STAGE_NAME = "Data Ingestion stage"
+STAGE_NAME = "Model Traniner stage"
 try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    model_tranier = ModelTrainerPipeline()
    model_tranier.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+        logger.exception(e)
+        raise e
+STAGE_NAME = "Model Evaluation stage"
+try:
+   logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+   model_evaluation = ModelEvaluationPipeline()
+   model_evaluation.main()
+   logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
         logger.exception(e)
         raise e

requirements.txt CHANGED Viewed

@@ -18,4 +18,4 @@ ensure==1.0.2
 fastapi==0.78.0
 uvicorn==0.18.3
 Jinja2==3.1.2
--e .

 fastapi==0.78.0
 uvicorn==0.18.3
 Jinja2==3.1.2

research/model_evaluatoin.ipynb ADDED Viewed

	@@ -0,0 +1,300 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import  os\n",
+    "os.chdir('../')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "@dataclass(frozen=True)\n",
+    "class ModelEvaluationConfig:\n",
+    "    root_dir : Path\n",
+    "    data_path : Path\n",
+    "    model_path : Path\n",
+    "    all_params: dict\n",
+    "    tokenizer_path : Path\n",
+    "    metric_file_name : Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from textsummarizer.constants import *\n",
+    "from textsummarizer.utils.common import read_yaml, create_directories, save_json, load_json\n",
+    "\n",
+    "class ConfigurationManager:\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        config_filepath = CONFIG_FILE_PATH,\n",
+    "        params_filepath = PARAMS_FILE_PATH):\n",
+    "\n",
+    "        self.config = read_yaml(config_filepath)\n",
+    "        self.params = read_yaml(params_filepath)\n",
+    "\n",
+    "        create_directories([self.config.artifacts_root])\n",
+    "\n",
+    "\n",
+    "    \n",
+    "    def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
+    "        config = self.config.model_evaluation\n",
+    "\n",
+    "        create_directories([config.root_dir])\n",
+    "\n",
+    "        model_evaluation_config = ModelEvaluationConfig(\n",
+    "            root_dir=config.root_dir,\n",
+    "            data_path=config.data_path,\n",
+    "            model_path = config.model_path,\n",
+    "            tokenizer_path = config.tokenizer_path,\n",
+    "            metric_file_name = config.metric_file_name\n",
+    "           \n",
+    "        )\n",
+    "\n",
+    "        return model_evaluation_config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-08-11 20:23:00,587: INFO: config: PyTorch version 2.2.2+cu121 available.]\n",
+      "[2024-08-11 20:23:00,589: INFO: config: TensorFlow version 2.12.0 available.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+    "from datasets import load_dataset, load_from_disk, load_metric\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlflow\n",
+    "import dagshub\n",
+    "import json\n",
+    "\n",
+    "class ModelEvaluation:\n",
+    "    def __init__(self, config: ModelEvaluationConfig):\n",
+    "        self.config = config\n",
+    "\n",
+    "    def generate_batch_sized_chunks(self, list_of_elements, batch_size):\n",
+    "        \"\"\"split the dataset into smaller batches that we can process simultaneously\n",
+    "        Yield successive batch-sized chunks from list_of_elements.\"\"\"\n",
+    "        for i in range(0, len(list_of_elements), batch_size):\n",
+    "            yield list_of_elements[i : i + batch_size]\n",
+    "\n",
+    "    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, \n",
+    "                               batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\", \n",
+    "                               column_text=\"article\", \n",
+    "                               column_summary=\"highlights\"):\n",
+    "        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n",
+    "        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n",
+    "\n",
+    "        for article_batch, target_batch in tqdm(\n",
+    "            zip(article_batches, target_batches), total=len(article_batches)):\n",
+    "            \n",
+    "            inputs = tokenizer(article_batch, max_length=1024,  truncation=True, \n",
+    "                            padding=\"max_length\", return_tensors=\"pt\")\n",
+    "            \n",
+    "            summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
+    "                            attention_mask=inputs[\"attention_mask\"].to(device), \n",
+    "                            length_penalty=0.8, num_beams=8, max_length=128)\n",
+    "            \n",
+    "            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n",
+    "                                    clean_up_tokenization_spaces=True) \n",
+    "                for s in summaries]      \n",
+    "            \n",
+    "            decoded_summaries = [d.replace(\"\", \" \") for d in decoded_summaries]\n",
+    "            \n",
+    "            metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
+    "            \n",
+    "        score = metric.compute()\n",
+    "        return score\n",
+    "\n",
+    "    def evaluate(self):\n",
+    "        # Set up MLflow tracking\n",
+    "        dagshub.init(repo_owner='azizulhakim8291', repo_name='text-summarization', mlflow=True)\n",
+    "        mlflow.set_tracking_uri(\"https://dagshub.com/azizulhakim8291/text-summarization.mlflow\")\n",
+    "        mlflow.set_experiment(\"text-summarization-evaluation\")\n",
+    "\n",
+    "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
+    "        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
+    "       \n",
+    "        dataset_samsum_pt = load_from_disk(self.config.data_path)\n",
+    "\n",
+    "        rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
+    "        rouge_metric = load_metric('rouge')\n",
+    "\n",
+    "        with mlflow.start_run():\n",
+    "            mlflow.log_param(\"model_name\", \"pegasus\")\n",
+    "            mlflow.log_param(\"dataset\", \"samsum\")\n",
+    "\n",
+    "            score = self.calculate_metric_on_test_ds(\n",
+    "                dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, \n",
+    "                batch_size = 2, column_text = 'dialogue', column_summary= 'summary'\n",
+    "            )\n",
+    "\n",
+    "            rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
+    "\n",
+    "            # Log metrics to MLflow\n",
+    "            for rouge_name, rouge_score in rouge_dict.items():\n",
+    "                mlflow.log_metric(rouge_name, rouge_score)\n",
+    "\n",
+    "            # Save results as JSON\n",
+    "            with open(self.config.metric_file_name, 'w') as f:\n",
+    "                json.dump(rouge_dict, f, indent=4)\n",
+    "\n",
+    "            # Log the JSON file as an artifact\n",
+    "            mlflow.log_artifact(self.config.metric_file_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-08-11 22:27:18,954: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+      "[2024-08-11 22:27:18,967: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2024-08-11 22:27:18,971: INFO: common: created directory at: artifacts]\n",
+      "[2024-08-11 22:27:18,973: INFO: common: created directory at: artifacts/model_evaluation]\n",
+      "[2024-08-11 22:27:19,619: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Initialized MLflow to track repo <span style=\"color: #008000; text-decoration-color: #008000\">\"azizulhakim8291/text-summarization\"</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "Initialized MLflow to track repo \u001b[32m\"azizulhakim8291/text-summarization\"\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-08-11 22:27:20,037: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Repository azizulhakim8291/text-summarization initialized!\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "Repository azizulhakim8291/text-summarization initialized!\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-08-11 22:27:20,040: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n",
+      "[2024-08-11 22:27:20,119: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\datasets\\load.py:756: FutureWarning: The repository for rouge contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/rouge/rouge.py\n",
+      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
+      "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
+      "  warnings.warn(\n",
+      "100%|██████████| 5/5 [00:21<00:00,  4.26s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-08-11 22:28:20,351: INFO: rouge_scorer: Using default tokenizer.]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    config = ConfigurationManager()\n",
+    "    model_evaluation_config = config.get_model_evaluation_config()\n",
+    "    model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
+    "    model_evaluation_config.evaluate()\n",
+    "except Exception as e:\n",
+    "    raise e"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

research/model_trainer.ipynb CHANGED Viewed

	@@ -0,0 +1,212 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.chdir('../')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'c:\\\\mlops projects\\\\text-summarization'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "@dataclass(frozen=True)\n",
+    "class ModelTrainerConfig:\n",
+    "    root_dir : Path\n",
+    "    data_path : Path\n",
+    "    model_ckpt  : Path\n",
+    "    num_train_epochs : int\n",
+    "    warmup_steps : int\n",
+    "    per_device_train_batch_size : int\n",
+    "    weight_decay : float\n",
+    "    logging_steps : int\n",
+    "    evaluation_strategy: str\n",
+    "    eval_steps: int\n",
+    "    save_steps: float\n",
+    "    gradient_accumulation_steps: int"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from textsummarizer.constants import *\n",
+    "from textsummarizer.utils.common import read_yaml,create_directories\n",
+    "\n",
+    "class ConfigurationManager:\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        config_filepath = CONFIG_FILE_PATH,\n",
+    "        params_filepath = PARAMS_FILE_PATH):\n",
+    "\n",
+    "        self.config = read_yaml(config_filepath)\n",
+    "        self.params = read_yaml(params_filepath)\n",
+    "\n",
+    "        create_directories([self.config.artifacts_root])\n",
+    "        \n",
+    "        \n",
+    "    def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
+    "        config = self.config.model_trainer\n",
+    "        params = self.params.TrainingArguments\n",
+    "\n",
+    "        create_directories([config.root_dir])\n",
+    "        \n",
+    "        \n",
+    "        model_trainer_config = ModelTrainerConfig(\n",
+    "            root_dir  = config.root_dir,\n",
+    "            data_path = config.data_path,\n",
+    "            model_ckpt = config.model_ckpt,\n",
+    "            num_train_epochs =params.num_train_epochs,\n",
+    "            warmup_steps =params.warmup_steps,\n",
+    "            per_device_train_batch_size = params.per_device_train_batch_size,\n",
+    "            weight_decay = params.weight_decay,\n",
+    "            logging_steps = params.logging_steps,\n",
+    "            evaluation_strategy =params.evaluation_strategy,\n",
+    "            eval_steps =params.eval_steps,\n",
+    "            save_steps =  params.save_steps,\n",
+    "            gradient_accumulation_steps = params.gradient_accumulation_steps\n",
+    "        )\n",
+    "        \n",
+    "        return model_trainer_config\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import TrainingArguments, Trainer\n",
+    "from transformers import DataCollatorForSeq2Seq\n",
+    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+    "from datasets import load_dataset, load_from_disk\n",
+    "import torch\n",
+    "import  os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ModelTrainer:\n",
+    "    def __init__(self, config : ModelTrainerConfig):\n",
+    "        self.config = config\n",
+    "        os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
+    "        \n",
+    "        \n",
+    "    def train(self):\n",
+    "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n",
+    "        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n",
+    "        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n",
+    "        \n",
+    "        #loading data \n",
+    "        dataset_samsum_pt = load_from_disk(self.config.data_path)\n",
+    "        \n",
+    "        \n",
+    "        trainer_args = TrainingArguments(\n",
+    "            output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,\n",
+    "            per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,\n",
+    "            weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,\n",
+    "            evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,\n",
+    "            gradient_accumulation_steps=self.config.gradient_accumulation_steps,\n",
+    "            report_to=\"none\"\n",
+    "            \n",
+    "        ) \n",
+    "        \n",
+    "        \n",
+    "        trainer = Trainer(model=model_pegasus, args=trainer_args,\n",
+    "                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,\n",
+    "                  train_dataset=dataset_samsum_pt[\"test\"], \n",
+    "                  eval_dataset=dataset_samsum_pt[\"validation\"])\n",
+    "        \n",
+    "        \n",
+    "        trainer.train()\n",
+    "\n",
+    "        ## Save model\n",
+    "        model_pegasus.save_pretrained(os.path.join(self.config.root_dir,\"pegasus-samsum-model\"))\n",
+    "        ## Save tokenizer\n",
+    "        tokenizer.save_pretrained(os.path.join(self.config.root_dir,\"tokenizer\"))\n",
+    "        \n",
+    "        \n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    config = ConfigurationManager()\n",
+    "    model_trainer_config = config.get_model_trainer_config()\n",
+    "    model_trainer_config = ModelTrainer(config=model_trainer_config)\n",
+    "    model_trainer_config.train()\n",
+    "except Exception as e:\n",
+    "    raise e"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

src/textsummarizer/config/configuration.py CHANGED Viewed

@@ -3,7 +3,8 @@ from textsummarizer.utils.common import read_yaml, create_directories
 from textsummarizer.entity.config_entity import (DataIngestionConfig,
                                                  DataValidationConfig,
                                                  DataTransformationConfig,
-                                                 ModelTrainerConfig)
 class ConfigurationManager:
     def __init__(
@@ -84,5 +85,24 @@ class ConfigurationManager:
         )
         return model_trainer_config

 from textsummarizer.entity.config_entity import (DataIngestionConfig,
                                                  DataValidationConfig,
                                                  DataTransformationConfig,
+                                                 ModelTrainerConfig,
+                                                 ModelEvaluationConfig)
 class ConfigurationManager:
     def __init__(
         )
         return model_trainer_config
+    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
+        config = self.config.model_evaluation
+        params = self.params.TrainingArguments
+        create_directories([config.root_dir])
+        model_evaluation_config = ModelEvaluationConfig(
+            root_dir=config.root_dir,
+            data_path=config.data_path,
+            model_path = config.model_path,
+            tokenizer_path = config.tokenizer_path,
+            metric_file_name = config.metric_file_name,
+            all_params = params
+        )
+        return model_evaluation_config

src/textsummarizer/conponents/model_evaluation.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from datasets import load_dataset, load_from_disk, load_metric
+import torch
+import pandas as pd
+from tqdm import tqdm
+from textsummarizer.entity.config_entity import ModelEvaluationConfig
+import mlflow
+import dagshub
+import json
+class ModelEvaluation:
+    def __init__(self, config: ModelEvaluationConfig):
+        self.config = config
+    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
+        """split the dataset into smaller batches that we can process simultaneously
+        Yield successive batch-sized chunks from list_of_elements."""
+        for i in range(0, len(list_of_elements), batch_size):
+            yield list_of_elements[i : i + batch_size]
+    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
+                               batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
+                               column_text="article",
+                               column_summary="highlights"):
+        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
+        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))
+        for article_batch, target_batch in tqdm(
+            zip(article_batches, target_batches), total=len(article_batches)):
+            inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
+                            padding="max_length", return_tensors="pt")
+            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
+                            attention_mask=inputs["attention_mask"].to(device),
+                            length_penalty=0.8, num_beams=8, max_length=128)
+            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
+                                    clean_up_tokenization_spaces=True)
+                for s in summaries]
+            decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
+            metric.add_batch(predictions=decoded_summaries, references=target_batch)
+        score = metric.compute()
+        return score
+    def evaluate(self):
+        # Set up MLflow tracking
+        dagshub.init(repo_owner='azizulhakim8291', repo_name='text-summarization', mlflow=True)
+        mlflow.set_tracking_uri("https://dagshub.com/azizulhakim8291/text-summarization.mlflow")
+        mlflow.set_experiment("text-summarization-evaluation")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
+        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
+        dataset_samsum_pt = load_from_disk(self.config.data_path)
+        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
+        rouge_metric = load_metric('rouge')
+        with mlflow.start_run():
+            mlflow.log_param("model_name", "pegasus")
+            mlflow.log_param("dataset", "samsum")
+            mlflow.log_param('parameter name', 'value')
+            score = self.calculate_metric_on_test_ds(
+                dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer,
+                batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
+            )
+            rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
+            mlflow.log_params(self.config.all_params)
+            # Log metrics to MLflow
+            for rouge_name, rouge_score in rouge_dict.items():
+                mlflow.log_metric(rouge_name, rouge_score)
+            # Save results as JSON
+            with open(self.config.metric_file_name, 'w') as f:
+                json.dump(rouge_dict, f, indent=4)
+            # Log the JSON file as an artifact
+            mlflow.log_artifact(self.config.metric_file_name)

src/textsummarizer/entity/config_entity.py CHANGED Viewed

@@ -38,3 +38,13 @@ class ModelTrainerConfig:
     eval_steps: int
     save_steps: float
     gradient_accumulation_steps: int

     eval_steps: int
     save_steps: float
     gradient_accumulation_steps: int
+@dataclass(frozen=True)
+class ModelEvaluationConfig:
+    root_dir : Path
+    data_path : Path
+    model_path : Path
+    all_params: dict
+    tokenizer_path : Path
+    metric_file_name : Path

src/textsummarizer/pipeline/predict.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from textsummarizer.config.configuration import ConfigurationManager
+from transformers import AutoTokenizer
+from transformers import pipeline
+class PredictionPipeline:
+    def __init__(self):
+        self.config = ConfigurationManager().get_model_evaluation_config()
+    def predict(self,text):
+        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
+        gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
+        pipe = pipeline("summarization", model=self.config.model_path,tokenizer=tokenizer)
+        print("Dialogue:")
+        print(text)
+        output = pipe(text, **gen_kwargs)[0]["summary_text"]
+        print("\nModel Summary:")
+        print(output)
+        return output

src/textsummarizer/pipeline/stage_05_model_evaluation.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from textsummarizer.conponents.model_evaluation import ModelEvaluation
+from textsummarizer.config.configuration import ConfigurationManager
+class ModelEvaluationPipeline:
+    def __init__(self):
+        pass
+    def main(self):
+        config = ConfigurationManager()
+        model_evaluation_config = config.get_model_evaluation_config()
+        model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
+        model_evaluation_config.evaluate()

src/textsummarizer/utils/common.py CHANGED Viewed

@@ -6,7 +6,7 @@ from ensure import ensure_annotations
 from box import ConfigBox
 from pathlib import Path
 from typing import Any
 @ensure_annotations
@@ -63,4 +63,39 @@ def get_size(path: Path) -> str:
     size_in_kb = round(os.path.getsize(path)/1024)
     return f"~ {size_in_kb} KB"

 from box import ConfigBox
 from pathlib import Path
 from typing import Any
+import json
 @ensure_annotations
     size_in_kb = round(os.path.getsize(path)/1024)
     return f"~ {size_in_kb} KB"
+@ensure_annotations
+def save_json(path: Path, data: dict):
+    """save json data
+    Args:
+        path (Path): path to json file
+        data (dict): data to be saved in json file
+    """
+    with open(path, 'w') as f:
+        json.dump(data, f, indent=4)
+    logger.info(f'Json file saved at: {path}')
+@ensure_annotations
+def load_json(path: Path) -> ConfigBox:
+    """load json files data
+    Args:
+        path (Path): path to json file
+    Returns:
+        ConfigBox: data as class attributes instead of dict
+    """
+    with open(path, 'r') as f:
+        content = json.load(f)
+    logger.info(f"Json file loaded successfully from: {path}")
+    return ConfigBox