asach
/

amigov1

Model card Files Files and versions Community

File size: 28,417 Bytes

d727a17

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_8024/106390013.py:10: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from tqdm.autonotebook import tqdm\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "sys.path.append(\"..\")\n",
    "\n",
    "import re\n",
    "import json\n",
    "import fire\n",
    "import string\n",
    "\n",
    "from tqdm.autonotebook import tqdm\n",
    "from medalpaca.inferer import Inferer\n",
    "\n",
    "\n",
    "# Decoding presets, unpacked as keyword arguments into the model call.\n",
    "# Every preset caps generation at 128 new tokens.\n",
    "greedy_search = {\n",
    "    \"num_beams\" : 1, \n",
    "    \"do_sample\" : False,\n",
    "    \"max_new_tokens\" : 128, \n",
    "    \"early_stopping\" : False\n",
    "}\n",
    "\n",
    "# The original (misspelled) name is kept so existing references keep working.\n",
    "beam_serach = {\n",
    "    \"num_beams\" : 4, \n",
    "    \"do_sample\" : False,\n",
    "    \"max_new_tokens\" : 128, \n",
    "    \"early_stopping\" : True,\n",
    "}\n",
    "beam_search = beam_serach  # correctly spelled alias\n",
    "\n",
    "# `early_stopping` is a beam-search flag: combined with num_beams=1 it has no\n",
    "# effect and makes transformers emit a UserWarning, so the sampling presets\n",
    "# below leave it out.\n",
    "sampling_top_k = {\n",
    "    \"do_sample\" : True,\n",
    "    \"num_beams\": 1,\n",
    "    \"max_new_tokens\": 128, \n",
    "    \"temperature\": 0.7,\n",
    "    \"top_k\": 50\n",
    "}\n",
    "\n",
    "sampling_top_p = {\n",
    "    \"do_sample\" : True,\n",
    "    \"top_k\": 0, \n",
    "    \"num_beams\": 1,\n",
    "    \"max_new_tokens\": 128, \n",
    "    \"temperature\": 0.7,\n",
    "    \"top_p\": 0.9\n",
    "}\n",
    "\n",
    "sampling = {\n",
    "    \"do_sample\" : True,\n",
    "    \"top_k\": 50, \n",
    "    \"num_beams\": 1,\n",
    "    \"max_new_tokens\": 128, \n",
    "    \"temperature\": 0.4,\n",
    "    \"top_p\": 0.9\n",
    "}\n",
    "\n",
    "\n",
    "def format_question(d): \n",
    "    \"\"\"Return the question text followed by one `key: value` line per option.\n",
    "\n",
    "    `d` must provide a \"question\" string and an \"options\" mapping; options\n",
    "    are appended in the mapping's iteration order.\n",
    "    \"\"\"\n",
    "    stem = d[\"question\"]\n",
    "    choices = d[\"options\"]\n",
    "    option_lines = [f\"\\n{label}: {text}\" for label, text in choices.items()]\n",
    "    return stem + \"\".join(option_lines)\n",
    "\n",
    "\n",
    "def strip_special_chars(input_str):\n",
    "    \"\"\"Trim leading and trailing characters that are not ASCII letters or digits.\n",
    "\n",
    "    Falsy input (empty string, None) is returned unchanged; a string without\n",
    "    any ASCII alphanumeric character yields an empty string.\n",
    "    \"\"\"\n",
    "    if not input_str:\n",
    "        return input_str\n",
    "\n",
    "    allowed = string.ascii_letters + string.digits\n",
    "    # Positions of every character that may remain at either edge.\n",
    "    keep_at = [pos for pos, char in enumerate(input_str) if char in allowed]\n",
    "    if not keep_at:\n",
    "        return \"\"\n",
    "\n",
    "    first, last = keep_at[0], keep_at[-1]\n",
    "    return input_str[first:last + 1]\n",
    "\n",
    "def starts_with_capital_letter(input_str):\n",
    "    \"\"\"\n",
    "    Tell whether an answer opens with an option letter.\n",
    "\n",
    "    Accepted openings are one capital letter, optionally followed by ':' or\n",
    "    '.', then a space and at least one more character:\n",
    "        'A: ...'\n",
    "        'A. ...'\n",
    "        'A ...'\n",
    "    \"\"\"\n",
    "    answer_prefix = r'^[A-Z](:|\\.|) .+'\n",
    "    return re.match(answer_prefix, input_str) is not None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model_name: str, # \"medalpaca/medalpaca-lora-13b-8bit\", \n",
    "# prompt_template: str, # \"../medalpaca/prompt_templates/medalpaca.json\", \n",
    "# base_model: str, # \"decapoda-research/llama-13b-hf\",\n",
    "# peft: bool, # True,\n",
    "# load_in_8bit: bool, # True\n",
    "# path_to_exams: str, # eval/data/test/\n",
    "# ntries: int = 5, \n",
    "# skip_if_exists: bool = True,\n",
    "\n",
    "\n",
    "# model = Inferer(\n",
    "#     model_name='medalpaca/medalpaca-7b',\n",
    "#     prompt_template=\"../medalpaca/prompt_templates/medalpaca.json\",\n",
    "#     base_model='decapoda-research/llama-7b-hf',\n",
    "#     peft=True,\n",
    "#     load_in_8bit=False,\n",
    "# ) \n",
    "    \n",
    "\n",
    "from transformers import pipeline\n",
    "\n",
    "pl = pipeline(\"text-generation\", model=\"medalpaca/medalpaca-7b\", tokenizer=\"medalpaca/medalpaca-7b\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/miniconda3/envs/med/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "/home/ubuntu/miniconda3/envs/med/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
      "  warnings.warn(\n",
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]\n",
      "Downloading adapter_model.bin: 100%|██████████| 33.6M/33.6M [00:01<00:00, 23.9MB/s]\n",
      "Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.\n"
     ]
    }
   ],
   "source": [
    "from langchain.llms import HuggingFacePipeline\n",
    "\n",
    "llm = HuggingFacePipeline.from_model_id(\n",
    "    model_id=\"Ali-C137/Llama-2-7b-chat-hf-tuned-medical-chat\",\n",
    "    task=\"text-generation\",\n",
    "    model_kwargs={\"temperature\": 0, \"max_length\": 64}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm(\"hello\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"What are the symptoms of diabetes?\"\n",
    "# context = \"Diabetes is a metabolic disease that causes high blood sugar. The symptoms include increased thirst, frequent urination, and unexplained weight loss.\"\n",
    "answer = pl(question,max_length=200)\n",
    "print(answer[0]['generated_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'generated_text': 'What are the symptoms of diabetes?\\nDiabetes is a disease in which your'}]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_to_exams = '/home/ubuntu/LLM/.conda/om/medAlpaca/data_clean/questions/US'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100\n"
     ]
    }
   ],
   "source": [
    "# Read the exam file: one JSON object per line.\n",
    "questions_path = os.path.join(path_to_exams, \"test.jsonl\")\n",
    "with open(questions_path) as f:\n",
    "    questions = list(map(json.loads, f))\n",
    "\n",
    "# Reuse previously saved answers when the output file already exists.\n",
    "outname = os.path.join(path_to_exams, \"ouput.json\")\n",
    "if not os.path.exists(outname): \n",
    "    answers = []\n",
    "else: \n",
    "    with open(outname, \"r\") as fp:\n",
    "        answers = json.load(fp)\n",
    "\n",
    "print(len(questions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/100 [00:00<?, ?it/s]/home/ubuntu/miniconda3/envs/med/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
      "  warnings.warn(\n",
      "/home/ubuntu/miniconda3/envs/med/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n",
      "  warnings.warn(\n",
      "  0%|          | 0/100 [01:29<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C: Tell the attending that he cannot fail to disclose this mistake.\n",
      "\n",
      "### Discussion:\n",
      "This is an ethi\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Maximum number of generations attempted per question; if none of them\n",
    "# looks like an answer, the last response is recorded as-is.\n",
    "ntries = 5\n",
    "\n",
    "# NOTE: `model` must be a callable accepting `instruction`, `input` and\n",
    "# `output` keyword arguments (such as the commented-out medalpaca `Inferer`\n",
    "# above); this cell does not define it.\n",
    "for question in tqdm(questions):\n",
    "    prompt = format_question(question)\n",
    "    print(prompt)\n",
    "    for attempt in range(1, ntries + 1):\n",
    "        response = model(\n",
    "            instruction=\"Answer this multiple choice question.\", \n",
    "            input=prompt, \n",
    "            output=\"The Answer to the question is:\",\n",
    "            **sampling\n",
    "        )\n",
    "        response = strip_special_chars(response)\n",
    "        print(response[:100])\n",
    "        if starts_with_capital_letter(response): \n",
    "            # Leave only the retry loop: the answer is still recorded below.\n",
    "            # (Previously this `break` aborted the loop over all questions,\n",
    "            # so no answer was ever stored.)\n",
    "            break\n",
    "        print(f\"Output not satisfactory (attempt {attempt}/{ntries})\")\n",
    "    question[\"answer\"] = response\n",
    "    answers.append(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the collected answers as a JSON array.\n",
    "with open(outname, \"w+\") as fp:\n",
    "    json.dump(answers, fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/miniconda3/envs/med/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
      "  warnings.warn(\n",
      "/home/ubuntu/miniconda3/envs/med/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "response = model(\n",
    "        instruction=\"hello.\", \n",
    "        input=format_question(question), \n",
    "        output=\"The Answer to the question is:\",\n",
    "        **sampling\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/miniconda3/envs/med/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Loading checkpoint shards: 100%|██████████| 3/3 [00:27<00:00,  9.22s/it]\n",
      "Downloading (…)neration_config.json: 100%|██████████| 284/284 [00:00<00:00, 1.46MB/s]\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from transformers.generation.utils import GenerationConfig\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM\", use_fast=False, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(\"/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM\", device_map=\"auto\", torch_dtype=torch.float16, trust_remote_code=True)\n",
    "model.generation_config = GenerationConfig.from_pretrained(\"Flmc/DISC-MedLLM\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = []\n",
    "messages.append({\"role\": \"user\", \"content\": \"Hello the patient will provide you with the reports & other information regarding the paitent. You have to answer the questions based on the information provided and your knowledge. Next you will talk with the paitent\"})\n",
    "response = model.chat(tokenizer, messages)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Yes of course! Please feel free to tell me about yourself so that I may better assist you\n"
     ]
    }
   ],
   "source": [
    "messages.append({\"role\": \"user\", \"content\": f\" Hello I am Om, can i ask you questions\"})\n",
    "response = model.chat(tokenizer, messages)\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# report_data = \"my recent cholesterol levels from a lab report. Their total cholesterol is 200 mg/dL, HDL cholesterol is 50 mg/dL, and LDL cholesterol is 130 mg/dL.\"\n",
    "report_data = \"None\"\n",
    "question = \"i am really worried about my high cholesterol levels. what should i do and what does it indicate?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "messages.append({\"role\": \"user\", \"content\": f\" Detials {report_data} : & User Question {question}\"})\n",
    "response = model.chat(tokenizer, messages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Cholesterol is a fatty substance that circulates throughout our bloodstreams as part of cell membranes within cells or lipoproteins (plasma proteins) called LDL-cholesterol which are found floating freely around inside plasma membrane bounded by an apo B100 protein molecule attached at one end via its phospholipid bilayer structure . It's essential for maintaining healthy brain function , nerve conduction pathways between neurons along nerves from sensory organs such as taste receptors located primarily underneath tongue epithelium into ganglia situated deep beneath dura mater covering cranial bones forming pa of basilar papilla whereby signals travel downwards towards spinal cord terminating eventually upon synapse connections formed amongst axonal branches projecting outwardly onto muscle fibers resulting ultimately leading upstairs back again all over body so we don't get tired when standing still but instead continue moving forward like this indefinitely without any fatigue feeling\\nCholesterol also plays important roles during embryonic development especially those involved involving neural crest derivatives including adrenal medulla derived chromaffin tissue responsible for producing catecholamines necessary not only just simply controlling heart rate itself per se plus helping regulate cardiac output volume etcetera ; liver sinusoid hepatocytes synthesizing vitamin A retinoic acid; skin keratinization process occurring mainly due to epidermis formation among many others\""
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import langchain\n",
    "import sqlite3\n",
    "from langchain.document_loaders import PyPDFLoader  \n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.llms import OpenAI\n",
    "from langchain.chains import ConversationalRetrievalChain,RetrievalQA\n",
    "from langchain.document_loaders import UnstructuredPDFLoader\n",
    "import openai\n",
    "import os\n",
    "import PyPDF2\n",
    "from langchain.document_loaders.csv_loader import CSVLoader\n",
    "from langchain import OpenAI, PromptTemplate\n",
    "from langchain.document_loaders import TextLoader, Docx2txtLoader, PyPDFLoader, UnstructuredExcelLoader, CSVLoader\n",
    "import logging\n",
    "from tqdm import tqdm\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
    "from langchain.chains.summarize import load_summarize_chain\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "import pandas as pd\n",
    "import uuid\n",
    "from PIL import Image\n",
    "\n",
    "# from utils import get_completion,model_info,model_load\n",
    "\n",
    "import pytesseract\n",
    "\n",
    "def get_text_img(path):\n",
    "    \"\"\"OCR the image at `path` and return the recognised text on a single line.\n",
    "\n",
    "    Newlines in the OCR output are replaced by spaces.\n",
    "    \"\"\"\n",
    "    # Open through a context manager so the image file handle is released.\n",
    "    with Image.open(path) as image:\n",
    "        return pytesseract.image_to_string(image).replace(\"\\n\", \" \")\n",
    "\n",
    "logging.basicConfig()\n",
    "logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)\n",
    "\n",
    "# Root directory under which one Chroma store per user is persisted.\n",
    "base_path = os.path.join(os.getcwd(),\"db\")\n",
    "\n",
    "# Read the OpenAI key from the environment instead of hard-coding it.\n",
    "# The key that used to be embedded here was published with the notebook and\n",
    "# should be revoked.\n",
    "key_openai = os.environ.get(\"OPENAI_API_KEY\")\n",
    "if not key_openai:\n",
    "    raise RuntimeError(\"Set the OPENAI_API_KEY environment variable before running this cell.\")\n",
    "embedding = OpenAIEmbeddings(openai_api_key =key_openai)\n",
    "\n",
    "# import torch\n",
    "# from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "# from transformers.generation.utils import GenerationConfig\n",
    "# tokenizer = AutoTokenizer.from_pretrained(\"/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM\", use_fast=False, trust_remote_code=True)\n",
    "# model = AutoModelForCausalLM.from_pretrained(\"/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM\", device_map=\"auto\", torch_dtype=torch.float16, trust_remote_code=True)\n",
    "# model.generation_config = GenerationConfig.from_pretrained(\"Flmc/DISC-MedLLM\")\n",
    "\n",
    "# Deterministic (temperature 0) chat models; the 16k variant feeds the summarize chain.\n",
    "data_llm_16k = ChatOpenAI(\n",
    "        model_name=\"gpt-3.5-turbo-16k\",\n",
    "        temperature = 0,\n",
    "        openai_api_key=key_openai,\n",
    "    )\n",
    "\n",
    "data_llm = ChatOpenAI(\n",
    "        model_name=\"gpt-3.5-turbo\",\n",
    "        temperature = 0,\n",
    "        openai_api_key=key_openai,\n",
    "    )\n",
    "\n",
    "chain = load_summarize_chain(data_llm_16k, chain_type=\"stuff\")\n",
    "\n",
    "def get_qa_chain_answers_llm(question,email):\n",
    "    \"\"\"Answer `question` with the chat model, using the user's stored chunks as context.\n",
    "\n",
    "    Fetches the three most similar chunks from the Chroma store persisted under\n",
    "    `base_path/<email>`, sends a priming instruction, then sends the retrieved\n",
    "    chunks together with the question to the module-level `model`/`tokenizer`.\n",
    "    Returns whatever `model.chat` returns for that second request.\n",
    "    \"\"\"\n",
    "    store_dir = os.path.join(base_path, str(email))\n",
    "    store = Chroma(persist_directory=store_dir, embedding_function=embedding)\n",
    "    k_tops = store.similarity_search(question, k=3)\n",
    "    print(k_tops)\n",
    "    print(\"LLM MODEL------------------------------\")\n",
    "\n",
    "    priming_turn = {\"role\": \"user\", \"content\": \"Hello the patient will provide you with the reports & other information regarding the paitent. You have to answer the questions based on the information provided and your knowledge. Next you will talk with the paitent\"}\n",
    "    messages = [priming_turn]\n",
    "    # The reply to the priming turn is discarded; only the final reply is returned.\n",
    "    model.chat(tokenizer, messages)\n",
    "\n",
    "    question_turn = {\"role\": \"user\", \"content\": f\" Detials {k_tops} : & User Question {question}\"}\n",
    "    messages.append(question_turn)\n",
    "    return model.chat(tokenizer, messages)\n",
    "\n",
    "# def get_qa_chain_answers(question,email,history=[]):\n",
    "#     title = str(email)\n",
    "#     persist_directory = os.path.join(base_path,title)\n",
    "#     db = Chroma(persist_directory=persist_directory, embedding_function=embedding)\n",
    "    \n",
    "#     # retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(),llm=data_llm)\n",
    "#     # unique_docs = retriever_from_llm.get_relevant_documents(query=question)\n",
    "\n",
    "#     qa_chain = RetrievalQA.from_chain_type(data_llm_16k,retriever=db.as_retriever())\n",
    "#     question_updated = \"Act Like a Medical doctor and give suggestions based on the context given or your own knwoelege and question asked\" + question\n",
    "#     answers = qa_chain({\"query\": question_updated})\n",
    "#     return answers['result']\n",
    "  \n",
    "def get_text(doc,file_name):\n",
    "    \"\"\"Load `doc` and return it as a list of LangChain documents chunked for embedding.\n",
    "\n",
    "    The parser is chosen from the extension of `file_name`: .pdf (PyPDF2),\n",
    "    .md/.txt, .docx/.doc, .csv, .xls/.xlsx and .png/.jpg/.jpeg (OCR).\n",
    "\n",
    "    Args:\n",
    "        doc: The file to read, passed to the selected reader/loader.\n",
    "        file_name: Name whose (case-insensitive) extension selects the parser.\n",
    "\n",
    "    Returns:\n",
    "        A list of Document chunks.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the extension is not supported.\n",
    "    \"\"\"\n",
    "    file_extension = os.path.splitext(file_name)[1].lower()\n",
    "    print(file_extension)\n",
    "\n",
    "    if file_extension == \".pdf\":\n",
    "        pdf = PyPDF2.PdfReader(doc)\n",
    "        # Guard against a page yielding no text.\n",
    "        pdf_text = \"\".join(page.extract_text() or \"\" for page in pdf.pages)\n",
    "        # Return Document chunks like every other branch: a bare string\n",
    "        # cannot be passed to Chroma.from_documents (see upload_chroma).\n",
    "        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
    "        return text_splitter.create_documents([pdf_text])\n",
    "\n",
    "    if file_extension == \".png\" or file_extension == \".jpg\" or file_extension == \".jpeg\":\n",
    "        ocr_text = get_text_img(doc)\n",
    "        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
    "        # create_documents expects a list of strings; a bare string would be\n",
    "        # iterated one character at a time.\n",
    "        texts = text_splitter.create_documents([ocr_text])\n",
    "        print(texts)\n",
    "        return texts\n",
    "\n",
    "    # Spreadsheets are split with a 100-character overlap, everything else with none.\n",
    "    chunk_overlap = 0\n",
    "    if file_extension == \".md\" or file_extension == \".txt\":\n",
    "        loader = TextLoader(doc)\n",
    "    elif file_extension in [\".docx\", \".doc\"]:\n",
    "        loader = Docx2txtLoader(doc)\n",
    "    elif file_extension == \".csv\":\n",
    "        loader = CSVLoader(file_path=doc)\n",
    "    elif file_extension in [\".xls\", \".xlsx\"]:\n",
    "        chunk_overlap = 100\n",
    "        try:\n",
    "            df = pd.read_excel(doc, engine='openpyxl')\n",
    "            # NOTE(review): this CSV is written to the working directory and never removed.\n",
    "            csv_name = f\"{str(uuid.uuid1())}.csv\"\n",
    "            df.to_csv(csv_name)\n",
    "            loader = CSVLoader(file_path=csv_name)\n",
    "        except Exception as e:\n",
    "            # Fall back to the unstructured loader when the sheet cannot be converted.\n",
    "            print(e)\n",
    "            loader = UnstructuredExcelLoader(doc, mode=\"elements\")\n",
    "    else:\n",
    "        raise ValueError(f\"Unsupported file extension: {file_extension}\")\n",
    "\n",
    "    documents = loader.load()\n",
    "    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=chunk_overlap)\n",
    "    texts = text_splitter.split_documents(documents)\n",
    "\n",
    "    return texts\n",
    "  \n",
    "# Reuse the key loaded above rather than embedding a second copy of the secret.\n",
    "embedding = OpenAIEmbeddings(openai_api_key=key_openai)\n",
    "\n",
    "def upload_chroma(book_file,filename,email):\n",
    "    \"\"\"Chunk `book_file`, embed the chunks and persist them in the store of `email`.\n",
    "\n",
    "    Args:\n",
    "        book_file: The file to ingest, forwarded to `get_text`.\n",
    "        filename: Name whose extension tells `get_text` how to parse the file.\n",
    "        email: Identifier used as the sub-directory of `base_path` for the store.\n",
    "    \"\"\"\n",
    "    title = str(email)\n",
    "    persist_directory = os.path.join(base_path,title)\n",
    "    # The context manager closes the progress bar even when a step raises.\n",
    "    with tqdm(total=100) as pbar:\n",
    "        final_texts = get_text(book_file,filename)\n",
    "        pbar.update(40)\n",
    "        db = Chroma.from_documents(final_texts, embedding , persist_directory=persist_directory)\n",
    "        pbar.update(40)\n",
    "        db.persist()\n",
    "        logging.info(f\"Successfully uploaded the PDF of the book: {title}\")\n",
    "        print(f\"Successfully uploaded the PDF of the book: {title}\")\n",
    "        pbar.update(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc = \"/home/ubuntu/LLM/.conda/om/medAlpaca/eval/section4_mobile_screen.png\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ".png\n"
     ]
    }
   ],
   "source": [
    "# Scratch replay of the get_text() branches for the sample file in `doc`.\n",
    "file_extension = \".png\"\n",
    "print(file_extension)\n",
    "if file_extension == \".pdf\":\n",
    "    pdf = PyPDF2.PdfReader(doc)\n",
    "    pdf_text = \"\"\n",
    "    for page in pdf.pages:\n",
    "        pdf_text += page.extract_text()\n",
    "    \n",
    "elif file_extension == \".md\" or file_extension == \".txt\":\n",
    "    loader = TextLoader(doc)\n",
    "elif file_extension in [\".docx\", \".doc\"]:\n",
    "    loader = Docx2txtLoader(doc)\n",
    "elif file_extension == \".csv\":\n",
    "    loader = CSVLoader(file_path=doc)\n",
    "elif file_extension in [\".xls\", \".xlsx\"]:\n",
    "    try:\n",
    "        df = pd.read_excel(doc, engine='openpyxl')\n",
    "        file_name = f\"{str(uuid.uuid1())}.csv\"\n",
    "        df.to_csv(file_name)\n",
    "        loader = CSVLoader(file_path=file_name)\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        loader = UnstructuredExcelLoader(doc, mode=\"elements\")\n",
    "    documents = loader.load()\n",
    "    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
    "    texts = text_splitter.split_documents(documents)\n",
    "\n",
    "elif file_extension == \".png\" or file_extension == \".jpg\" or file_extension == \".jpeg\":\n",
    "    ocr_text = get_text_img(doc)\n",
    "    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
    "    # create_documents expects a list of strings; a bare string would be\n",
    "    # iterated one character at a time.\n",
    "    texts = text_splitter.create_documents([ocr_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Profile details  Payal Tandon  Female  etd Seen  Patient details  Name Surname Date of Birth city  Country  Shared profile  Rekha Singhn Tviews  Ey  Payal Tandon Luly 16, 1990 (30y) Mumbai  India    \\x0c'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "texts = get_text_img(doc)\n",
    "texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.schema.document import Document\n",
    "\n",
    "def get_text_chunks_langchain(text):\n",
    "    \"\"\"Split `text` into chunks (size 500, overlap 100) and wrap each one in a Document.\"\"\"\n",
    "    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
    "    chunks = splitter.split_text(text)\n",
    "    return [Document(page_content=chunk) for chunk in chunks]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='Profile details  Payal Tandon  Female  etd Seen  Patient details  Name Surname Date of Birth city  Country  Shared profile  Rekha Singhn Tviews  Ey  Payal Tandon Luly 16, 1990 (30y) Mumbai  India', metadata={})]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_text_chunks_langchain(texts)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "med",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}