File size: 6,490 Bytes
d727a17 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
import os
import langchain
import sqlite3
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain,RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
import openai
import os
import PyPDF2
from langchain.document_loaders.csv_loader import CSVLoader
from langchain import OpenAI, PromptTemplate
from langchain.document_loaders import TextLoader, Docx2txtLoader, PyPDFLoader, UnstructuredExcelLoader, CSVLoader
import logging
from tqdm import tqdm
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import uuid
from PIL import Image
from utils import get_completion,model_info,model_load
import pytesseract
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document
def get_text_chunks_langchain(text):
    """Split a raw string into LangChain ``Document`` chunks.

    The text is cut by a ``CharacterTextSplitter`` configured with
    ``chunk_size=500`` and ``chunk_overlap=100``; every piece it yields is
    wrapped in its own ``Document``.

    Args:
        text: The string to split.

    Returns:
        A list of ``Document`` objects, one per piece, in splitter order.
    """
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = []
    for piece in splitter.split_text(text):
        chunks.append(Document(page_content=piece))
    return chunks
def get_text_img(path):
    """Run OCR on an image and return the recognised text.

    Args:
        path: The image to read; passed straight to ``PIL.Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # Open the image in a ``with`` block so the handle opened by PIL is
    # released as soon as OCR finishes, instead of lingering until the
    # image object happens to be garbage-collected.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

# Root directory under which one Chroma store per user is persisted.
base_path = os.path.join(os.getcwd(), "db")

# The OpenAI secret must never live in source control: read it from the
# environment and fail loudly at import time when it is missing, rather than
# shipping a key inside the repository.
key_openai = os.environ.get("OPENAI_API_KEY")
if not key_openai:
    raise RuntimeError(
        "OPENAI_API_KEY environment variable is not set; "
        "export it before importing this module."
    )

# Embedding function shared by the Chroma reads and writes in this module.
embedding = OpenAIEmbeddings(openai_api_key=key_openai)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
# Directory holding the local DISC-MedLLM checkout; the tokenizer and the
# weights are both loaded from it.
_DISC_MEDLLM_DIR = "/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM"

tokenizer = AutoTokenizer.from_pretrained(
    _DISC_MEDLLM_DIR,
    use_fast=False,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    _DISC_MEDLLM_DIR,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
# The generation config is looked up by repository id, not from the local
# directory used above.
model.generation_config = GenerationConfig.from_pretrained("Flmc/DISC-MedLLM")
def _make_chat_llm(model_name):
    """Build a ``ChatOpenAI`` client for *model_name* with temperature 0."""
    return ChatOpenAI(
        model_name=model_name,
        temperature=0,
        openai_api_key=key_openai,
    )


# Two OpenAI chat clients that differ only in the model they target.
data_llm_16k = _make_chat_llm("gpt-3.5-turbo-16k")
data_llm = _make_chat_llm("gpt-3.5-turbo")

# "stuff"-type summarisation chain driven by the 16k model.
chain = load_summarize_chain(data_llm_16k, chain_type="stuff")
def get_qa_chain_answers_llm(question, email):
    """Answer a question with the local chat model, using the user's stored documents.

    The three stored chunks most similar to ``question`` are fetched from the
    Chroma store persisted under ``base_path/<email>`` and handed to the
    module-level ``model`` together with the question.

    Args:
        question: The user's question.
        email: Identifies the user; ``str(email)`` names the store directory.

    Returns:
        The value returned by the final ``model.chat`` call.
    """
    title = str(email)
    persist_directory = os.path.join(base_path, title)
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    k_tops = db.similarity_search(question, k=3)
    print(k_tops)
    print("LLM MODEL------------------------------")

    messages = [
        {"role": "user", "content": "Hello the patient will provide you with the reports & other information regarding the paitent. You have to answer the questions based on the information provided and your knowledge in simplier langauge. Next you will talk with the paitent"},
    ]
    # Keep the model's reply to the priming turn in the history. Previously
    # the reply was thrown away, so the generation was wasted and the second
    # call saw two consecutive user messages instead of an alternating
    # user/assistant conversation.
    priming_reply = model.chat(tokenizer, messages)
    messages.append({"role": "assistant", "content": priming_reply})

    messages.append({"role": "user", "content": f" Detials {k_tops} : & User Question {question}"})
    return model.chat(tokenizer, messages)
# def get_qa_chain_answers(question,email,history=[]):
# title = str(email)
# persist_directory = os.path.join(base_path,title)
# db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
# # retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(),llm=data_llm)
# # unique_docs = retriever_from_llm.get_relevant_documents(query=question)
# qa_chain = RetrievalQA.from_chain_type(data_llm_16k,retriever=db.as_retriever())
# question_updated = "Act Like a Medical doctor and give suggestions based on the context given or your own knowledge and question asked" + question
# answers = qa_chain({"query": question_updated})
# return answers['result']
def _load_excel_documents(doc):
    """Load an Excel workbook as LangChain documents, cleaning up the temp CSV."""
    import tempfile  # local import: not among the file's top-level imports

    csv_path = None
    try:
        try:
            df = pd.read_excel(doc, engine='openpyxl')
            # Convert to CSV in a private temporary file instead of dropping a
            # uuid-named file into the working directory on every upload.
            fd, csv_path = tempfile.mkstemp(suffix=".csv")
            os.close(fd)
            df.to_csv(csv_path)
            loader = CSVLoader(file_path=csv_path)
        except Exception as e:
            # Deliberate best-effort: any failure on the pandas route falls
            # back to the unstructured Excel loader.
            print(e)
            loader = UnstructuredExcelLoader(doc, mode="elements")
        return loader.load()
    finally:
        # The CSV is only an intermediate artefact; remove it once loaded.
        if csv_path is not None:
            try:
                os.remove(csv_path)
            except OSError:
                pass


def get_text(doc, file_name):
    """Extract the content of an uploaded file, dispatching on its extension.

    Args:
        doc: The file to read; handed to the reader chosen for the extension.
        file_name: Name of the file; only its lower-cased extension is used.

    Returns:
        For ``.pdf``: one ``str`` with the extracted text of all pages.
        For every other supported type: a list of split LangChain documents
        (chunk size 1000 for loader-based types, overlap 100 for Excel and 0
        for text/Word/CSV; images go through ``get_text_chunks_langchain``).

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    file_extension = os.path.splitext(file_name)[1].lower()
    print(file_extension)

    if file_extension == ".pdf":
        pdf = PyPDF2.PdfReader(doc)
        # join() avoids repeated string concatenation; ``or ""`` keeps a page
        # that yields no text from breaking the whole extraction.
        return "".join(page.extract_text() or "" for page in pdf.pages)

    if file_extension in (".xls", ".xlsx"):
        documents = _load_excel_documents(doc)
        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        return splitter.split_documents(documents)

    if file_extension in (".png", ".jpg", ".jpeg"):
        # Images are OCR'd first, then chunked like plain text.
        return get_text_chunks_langchain(get_text_img(doc))

    if file_extension in (".md", ".txt"):
        loader = TextLoader(doc)
    elif file_extension in (".docx", ".doc"):
        loader = Docx2txtLoader(doc)
    elif file_extension == ".csv":
        loader = CSVLoader(file_path=doc)
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return splitter.split_documents(loader.load())
# Re-create the shared embedding function from the module-level key instead
# of repeating the secret as a second string literal in the source.
embedding = OpenAIEmbeddings(openai_api_key=key_openai)
def upload_chroma(book_file, filename, email):
    """Extract a file's content and persist it into the user's Chroma store.

    Args:
        book_file: The uploaded file; forwarded to ``get_text``.
        filename: Name of the upload; ``get_text`` uses it to pick a reader.
        email: Identifies the user; ``str(email)`` names the store directory
            under ``base_path``.

    Returns:
        None. The store is written to disk and a success message is
        logged and printed.
    """
    # The ``with`` block closes the progress bar even when extraction or
    # indexing raises.
    with tqdm(total=100) as pbar:
        final_texts = get_text(book_file, filename)
        # ``get_text`` returns one plain string for PDFs, but
        # ``Chroma.from_documents`` needs Document objects, so chunk the
        # string first.
        if isinstance(final_texts, str):
            final_texts = get_text_chunks_langchain(final_texts)
        pbar.update(40)

        title = str(email)
        persist_directory = os.path.join(base_path, title)
        db = Chroma.from_documents(final_texts, embedding, persist_directory=persist_directory)
        pbar.update(40)

        db.persist()
        logging.info(f"Successfully uploaded the PDF of the book: {title}")
        print(f"Successfully uploaded the PDF of the book: {title}")
        pbar.update(20)