{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 4 documents.\n" ] } ], "source": [ "from llama_index.core import SimpleDirectoryReader\n", "documents = SimpleDirectoryReader(r\"C:\\Users\\agshi\\Downloads\\output_chunks\", filename_as_id=True).load_data()\n", "print(f\"Loaded {len(documents)} documents.\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from llama_index.llms.ollama import Ollama\n", "\n", "llm = Ollama(model=\"llama3:8b\", request_timeout=120.0)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from llama_index.core import SimpleDirectoryReader\n", "from llama_index.core.llama_dataset.generator import RagDatasetGenerator\n", "from llama_index.llms.gemini import Gemini\n", "import os\n", "from llama_index.core import Settings\n", "from llama_index.llms.groq import Groq\n", "import nest_asyncio\n", "nest_asyncio.apply()\n", "# create llm\n", "llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n", "\n", "def question_dataset_generator(document):\n", " dataset_generator = RagDatasetGenerator.from_documents(\n", " documents=document,\n", " llm=llm,\n", " num_questions_per_chunk=2,\n", " show_progress=True # set the number of qu/estions per nodes\n", " )\n", "\n", " rag_dataset = dataset_generator.generate_questions_from_nodes()\n", " question = [e.query for e in rag_dataset.examples]\n", " return question" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "122068b898824caabf36017c8ef57733", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Parsing nodes: 0%| | 0/2 [00:00here for more info. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "import pickle\n", "\n", "# Path to the pickle file\n", "pickle_file_path = r'C:\\Users\\agshi\\Downloads\\question_list.pkl'\n", "\n", "# Load the question list from the pickle file\n", "with open(pickle_file_path, 'rb') as file:\n", " loaded_questions = pickle.load(file)\n", "\n", "print(\"Loaded questions:\", loaded_questions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from llama_index.core.evaluation import BatchEvalRunner\n", "from llama_index.core.evaluation import RelevancyEvaluator\n", "from llama_index.core.evaluation import FaithfulnessEvaluator\n", "from llama_index.llms.groq import Groq\n", "import os\n", "from llama_index.llms.gemini import Gemini\n", "\n", "os.environ[\"GOOGLE_API_KEY\"] = \"AIzaSyClIR8gLfV7DhuF8idI8BG6PuGLdEo2tIM\"\n", "llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n", "relevancy_evaluator = RelevancyEvaluator(llm=llm)\n", "faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def extract_elements(eval_result):\n", " # Dictionary to store the extracted elements\n", " extracted_data = {\n", " \"contexts\": [eval_result.contexts],\n", " \"response\": [eval_result.response],\n", " \"passing\": [eval_result.passing],\n", " \"feedback\": [eval_result.feedback],\n", " \"score\": [eval_result.score],\n", " \"pairwise_source\": [eval_result.pairwise_source],\n", " \"invalid_result\": [eval_result.invalid_result],\n", " \"invalid_reason\": [eval_result.invalid_reason]\n", " }\n", "\n", " # Convert the dictionary into a DataFrame\n", " df = pd.DataFrame(extracted_data)\n", " return df" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'query_engine' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[11], line 8\u001b[0m\n\u001b[0;32m 6\u001b[0m llm \u001b[38;5;241m=\u001b[39m Gemini(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels/gemini-pro\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m question \u001b[38;5;129;01min\u001b[39;00m questions:\n\u001b[1;32m----> 8\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mquery_engine\u001b[49m\u001b[38;5;241m.\u001b[39mquery(question)\n\u001b[0;32m 10\u001b[0m \u001b[38;5;66;03m# Evaluate faithfulness\u001b[39;00m\n\u001b[0;32m 11\u001b[0m eval_result \u001b[38;5;241m=\u001b[39m faithfulness_evaluator\u001b[38;5;241m.\u001b[39mevaluate_response(query\u001b[38;5;241m=\u001b[39mquestion, response\u001b[38;5;241m=\u001b[39mresponse)\n", "\u001b[1;31mNameError\u001b[0m: name 'query_engine' is not defined" ] } ], "source": [ "import time\n", "import pandas as pd\n", "faithfulness_df = pd.DataFrame()\n", "relevancy_df = pd.DataFrame()\n", "os.environ[\"GOOGLE_API_KEY\"] = \"AIzaSyC_UnbyMmhvklBRyjLvdEWXuhXim_BX0fk\"\n", "llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n", "for question in questions:\n", " response = query_engine.query(question)\n", "\n", " # Evaluate faithfulness\n", " eval_result = faithfulness_evaluator.evaluate_response(query=question, response=response)\n", " faithfulness_elements = extract_elements(eval_result)\n", " faithfulness_df = pd.concat([faithfulness_df, faithfulness_elements], ignore_index=True)\n", "\n", " # Evaluate relevancy\n", " eval_result = relevancy_evaluator.evaluate_response( query=question, response=response)\n", " relevancy_elements = extract_elements(eval_result)\n", " relevancy_df = pd.concat([relevancy_df,relevancy_elements], ignore_index=True)\n", "\n", " time.sleep(60)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Content split into 21 chunks.\n", "Chunk 1 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_1.txt\n", "Chunk 2 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_2.txt\n", "Chunk 3 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_3.txt\n", "Chunk 4 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_4.txt\n", "Chunk 5 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_5.txt\n", "Chunk 6 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_6.txt\n", "Chunk 7 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_7.txt\n", "Chunk 8 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_8.txt\n", "Chunk 9 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_9.txt\n", "Chunk 10 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_10.txt\n", "Chunk 11 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_11.txt\n", "Chunk 12 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_12.txt\n", "Chunk 13 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_13.txt\n", "Chunk 14 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_14.txt\n", "Chunk 15 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_15.txt\n", "Chunk 16 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_16.txt\n", "Chunk 17 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_17.txt\n", "Chunk 18 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_18.txt\n", "Chunk 19 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_19.txt\n", "Chunk 20 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_20.txt\n", "Chunk 21 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_21.txt\n" ] } ], "source": [ "import os\n", "\n", "# Step 1: Read the content of the text file\n", "input_file_path = r\"C:\\Users\\agshi\\Downloads\\discussion_data-20240911T083316Z-001\\discussion_data\\C-2_4.txt\"\n", "with open(input_file_path, 'r', encoding='utf-8') as file:\n", " content = file.read()\n", "\n", "# Step 2: Split the content into chunks based on a defined chunk size\n", "chunk_size = 5000 # Adjust chunk size as needed\n", "chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]\n", "\n", "print(f\"Content split into {len(chunks)} chunks.\")\n", "\n", "# Step 3: Save each chunk into a new file in the output folder\n", "output_folder = r'C:\\Users\\agshi\\Downloads\\output_chunks'\n", "os.makedirs(output_folder, exist_ok=True)\n", "\n", "# Extract the base name of the input file (without extension)\n", "file_base_name = os.path.splitext(os.path.basename(input_file_path))[0]\n", "\n", "# Save each chunk with the file name and chunk number\n", "for i, chunk in enumerate(chunks):\n", " output_file_path = os.path.join(output_folder, f\"{file_base_name}_chunk_{i+1}.txt\")\n", " \n", " # Write each chunk into a separate file\n", " with open(output_file_path, 'w', encoding='utf-8') as output_file:\n", " output_file.write(chunk)\n", " \n", " print(f\"Chunk {i+1} saved as: {output_file_path}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }