import logging
import os
import uuid

import pandas as pd
import PyPDF2
import pytesseract
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import (
    CSVLoader,
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredExcelLoader,
    UnstructuredPDFLoader,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from utils import get_completion, model_info, model_load

def get_text_chunks_langchain(text):
    """Split raw text into LangChain Document chunks ready for embedding."""
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    docs = [Document(page_content=x) for x in text_splitter.split_text(text)]
    return docs

def get_text_img(path):
    """Extract text from an image file via Tesseract OCR."""
    return pytesseract.image_to_string(Image.open(path))
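
# Illustrative usage (the file name below is hypothetical): OCR a scanned report,
# then chunk the extracted text for embedding.
#   docs = get_text_chunks_langchain(get_text_img("report_scan.png"))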

logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

base_path = os.path.join(os.getcwd(), "db")
# Read the OpenAI key from the environment instead of hard-coding a secret in source.
key_openai = os.environ.get("OPENAI_API_KEY")
embedding = OpenAIEmbeddings(openai_api_key=key_openai)

# Load the locally downloaded DISC-MedLLM checkpoint; trust_remote_code is required
# because the model ships its own chat() implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM", use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM",
    device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained("Flmc/DISC-MedLLM")

# GPT-3.5 clients: the 16k-context variant is used where longer contexts are expected.
data_llm_16k = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0,
    openai_api_key=key_openai,
)

data_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=key_openai,
)

# "stuff" chain: concatenates all documents into a single prompt for summarization.
chain = load_summarize_chain(data_llm_16k, chain_type="stuff")
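
# The summarize chain above is not invoked elsewhere in this module. The helper below is an
# illustrative sketch (an assumption, not part of the original flow) of how it could be used.
def summarize_uploaded_file(doc, file_name):
    """Summarize one uploaded file by running the 'stuff' chain over its extracted chunks."""
    docs = get_text(doc, file_name)  # get_text() is defined later in this module
    return chain.run(docs)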

def get_qa_chain_answers_llm(question, email):
    """Answer a patient question using the top-k similar chunks from that user's Chroma store."""
    title = str(email)
    persist_directory = os.path.join(base_path, title)
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    k_tops = db.similarity_search(question, k=3)
    print(k_tops)
    #question_new = f" 'context' {k_tops}: '{question}'"
    #res = get_completion(question_new, 300, 0)
    print("LLM MODEL------------------------------")
    messages = []
    messages.append({"role": "user", "content": "Hello, the patient will provide you with reports and other information about the patient. Answer the questions in simpler language, based on the information provided and your own knowledge. Next you will talk with the patient."})
    priming_reply = model.chat(tokenizer, messages)
    # Keep the model's reply in the history so the follow-up turn is a proper multi-turn chat.
    messages.append({"role": "assistant", "content": priming_reply})
    messages.append({"role": "user", "content": f"Details {k_tops} : & User Question {question}"})
    return model.chat(tokenizer, messages)

# def get_qa_chain_answers(question,email,history=[]):
#     title = str(email)
#     persist_directory = os.path.join(base_path,title)
#     db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    
#     # retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(),llm=data_llm)
#     # unique_docs = retriever_from_llm.get_relevant_documents(query=question)

#     qa_chain = RetrievalQA.from_chain_type(data_llm_16k,retriever=db.as_retriever())
#     question_updated = "Act like a medical doctor and give suggestions based on the context given, or your own knowledge, and the question asked: " + question
#     answers = qa_chain({"query": question_updated})
#     return answers['result']
  
def get_text(doc, file_name):
    """Load a file and return its contents as a list of chunked LangChain Documents."""
    file_extension = os.path.splitext(file_name)[1].lower()
    print(file_extension)
    if file_extension == ".pdf":
        pdf = PyPDF2.PdfReader(doc)
        pdf_text = ""
        for page in pdf.pages:
            pdf_text += page.extract_text()
        # Wrap the raw PDF text in Documents so every branch returns the same type.
        return get_text_chunks_langchain(pdf_text)

    elif file_extension in [".md", ".txt"]:
        loader = TextLoader(doc)
    elif file_extension in [".docx", ".doc"]:
        loader = Docx2txtLoader(doc)
    elif file_extension == ".csv":
        loader = CSVLoader(file_path=doc)
    elif file_extension in [".xls", ".xlsx"]:
        try:
            # Convert the spreadsheet to a temporary CSV so CSVLoader can ingest it.
            df = pd.read_excel(doc, engine='openpyxl')
            file_name = f"{str(uuid.uuid1())}.csv"
            df.to_csv(file_name)
            loader = CSVLoader(file_path=file_name)
        except Exception as e:
            print(e)
            loader = UnstructuredExcelLoader(doc, mode="elements")
        documents = loader.load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        texts = text_splitter.split_documents(documents)
        return texts

    elif file_extension in [".png", ".jpg", ".jpeg"]:
        texts = get_text_img(doc)
        text_docs = get_text_chunks_langchain(texts)
        return text_docs

    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    return texts
  
# `embedding` was already initialized above with the key from the environment; no need to
# re-create it here with a hard-coded secret.

def upload_chroma(book_file, filename, email):
    """Extract text from an uploaded file and persist its embeddings to the user's Chroma store."""
    pbar = tqdm(total=100)
    final_texts = get_text(book_file, filename)
    pbar.update(40)
    title = str(email)
    persist_directory = os.path.join(base_path, title)
    db = Chroma.from_documents(final_texts, embedding, persist_directory=persist_directory)
    pbar.update(40)
    db.persist()
    logging.info(f"Successfully uploaded and indexed the document for: {title}")
    print(f"Successfully uploaded and indexed the document for: {title}")
    pbar.update(20)
    pbar.close()
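

if __name__ == "__main__":
    # Minimal local smoke test (illustrative only): the file path and email below are hypothetical,
    # OPENAI_API_KEY must be set in the environment, and the local DISC-MedLLM checkpoint must exist.
    sample_path = "sample_report.pdf"
    with open(sample_path, "rb") as f:
        upload_chroma(f, sample_path, "patient@example.com")
    print(get_qa_chain_answers_llm("What do my latest blood test results indicate?", "patient@example.com"))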