amigov1 / file_handle.py
asach's picture
Upload folder using huggingface_hub
d727a17
raw
history blame
6.49 kB
import os
import langchain
import sqlite3
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain,RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
import openai
import os
import PyPDF2
from langchain.document_loaders.csv_loader import CSVLoader
from langchain import OpenAI, PromptTemplate
from langchain.document_loaders import TextLoader, Docx2txtLoader, PyPDFLoader, UnstructuredExcelLoader, CSVLoader
import logging
from tqdm import tqdm
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import uuid
from PIL import Image
from utils import get_completion,model_info,model_load
import pytesseract
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document
def get_text_chunks_langchain(text):
    """Split raw text into LangChain ``Document`` chunks.

    The text is cut by a ``CharacterTextSplitter`` configured with a chunk
    size of 500 and an overlap of 100, and every resulting piece is wrapped
    in its own ``Document``.

    Args:
        text: The string to split.

    Returns:
        A list of ``Document`` objects, one per chunk, in splitter order.
    """
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    wrapped_chunks = []
    for piece in splitter.split_text(text):
        wrapped_chunks.append(Document(page_content=piece))
    return wrapped_chunks
def get_text_img(path):
    """Run OCR on an image and return the text pytesseract extracts from it.

    Args:
        path: Location of the image; passed straight to ``Image.open``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until GC.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)
# Configure root logging and surface the queries generated by LangChain's
# multi-query retriever at INFO level.
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

# Root directory under which one Chroma store per user is kept.
base_path = os.path.join(os.getcwd(), "db")

# SECURITY: the OpenAI key used to be hard-coded here. Secrets must not live
# in source control, so it is now taken from the environment. The key that
# was previously committed should be treated as leaked and revoked.
key_openai = os.environ.get("OPENAI_API_KEY")
embedding = OpenAIEmbeddings(openai_api_key=key_openai)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

# Local checkpoint directory shared by the tokenizer and the model weights.
_DISC_MEDLLM_DIR = "/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM"

tokenizer = AutoTokenizer.from_pretrained(
    _DISC_MEDLLM_DIR,
    use_fast=False,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    _DISC_MEDLLM_DIR,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
# The generation config is resolved by hub id rather than from the local
# checkpoint directory, exactly as before.
model.generation_config = GenerationConfig.from_pretrained("Flmc/DISC-MedLLM")

# Deterministic (temperature 0) OpenAI chat models: a 16k-context variant and
# the standard one.
data_llm_16k = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0,
    openai_api_key=key_openai,
)
data_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=key_openai,
)

# "stuff" summarization chain backed by the 16k-context model.
chain = load_summarize_chain(data_llm_16k, chain_type="stuff")
def get_qa_chain_answers_llm(question, email):
    """Answer a question with the local chat model, grounded on the user's store.

    The three stored chunks most similar to ``question`` are fetched from the
    Chroma store persisted under ``base_path/<email>`` and handed to the
    module-level ``model`` together with the question.

    Args:
        question: The user's question; also used as the similarity query.
        email: Identifies the user; its string form names the store directory.

    Returns:
        Whatever ``model.chat`` returns for the final user turn.
    """
    # NOTE(review): `email` is joined into a filesystem path as-is; if it can
    # contain path separators it should be validated by the caller - confirm.
    title = str(email)
    persist_directory = os.path.join(base_path, title)
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding)

    k_tops = db.similarity_search(question, k=3)
    print(k_tops)
    print("LLM MODEL------------------------------")

    messages = [
        {"role": "user", "content": "Hello the patient will provide you with the reports & other information regarding the paitent. You have to answer the questions based on the information provided and your knowledge in simplier langauge. Next you will talk with the paitent"},
    ]
    # Keep the model's reply to the priming turn in the history. Previously it
    # was thrown away, so the second call saw two consecutive user turns and
    # the first (expensive) inference had no effect on the conversation.
    # Assumes model.chat returns the reply text - TODO confirm for this model.
    priming_reply = model.chat(tokenizer, messages)
    messages.append({"role": "assistant", "content": priming_reply})

    messages.append({"role": "user", "content": f" Detials {k_tops} : & User Question {question}"})
    return model.chat(tokenizer, messages)
# def get_qa_chain_answers(question,email,history=[]):
# title = str(email)
# persist_directory = os.path.join(base_path,title)
# db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
# # retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(),llm=data_llm)
# # unique_docs = retriever_from_llm.get_relevant_documents(query=question)
# qa_chain = RetrievalQA.from_chain_type(data_llm_16k,retriever=db.as_retriever())
# question_updated = "Act Like a Medical doctor and give suggestions based on the context given or your own knwoelege and question asked" + question
# answers = qa_chain({"query": question_updated})
# return answers['result']
def get_text(doc, file_name):
    """Extract the content of a file, choosing a strategy from its extension.

    Args:
        doc: The file to read. It is handed unchanged to the PDF reader, the
            LangChain loaders, pandas or the OCR helper - presumably a
            filesystem path (TODO confirm against callers).
        file_name: Name whose (case-insensitive) extension selects the
            extraction strategy.

    Returns:
        For ``.pdf``: one ``str`` with the text of all pages concatenated.
        For every other supported type: a list of split documents.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    import tempfile  # local import: only the spreadsheet branch needs it

    file_extension = os.path.splitext(file_name)[1].lower()
    print(file_extension)

    if file_extension == ".pdf":
        pdf = PyPDF2.PdfReader(doc)
        # join() avoids repeated string concatenation; `or ""` keeps a page
        # without extractable text from breaking the whole document.
        return "".join(page.extract_text() or "" for page in pdf.pages)

    if file_extension in (".png", ".jpg", ".jpeg"):
        return get_text_chunks_langchain(get_text_img(doc))

    if file_extension in (".xls", ".xlsx"):
        temp_csv = None
        try:
            df = pd.read_excel(doc, engine='openpyxl')
            # Convert via a real temporary file instead of dropping a
            # uuid-named CSV into the working directory that is never removed.
            handle, temp_csv = tempfile.mkstemp(suffix=".csv")
            os.close(handle)
            df.to_csv(temp_csv)
            loader = CSVLoader(file_path=temp_csv)
        except Exception as e:
            # Best-effort fallback, as before: let the unstructured loader
            # try the workbook directly.
            print(e)
            loader = UnstructuredExcelLoader(doc, mode="elements")
        try:
            documents = loader.load()
        finally:
            if temp_csv is not None and os.path.exists(temp_csv):
                os.remove(temp_csv)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        return text_splitter.split_documents(documents)

    if file_extension in (".md", ".txt"):
        loader = TextLoader(doc)
    elif file_extension in (".docx", ".doc"):
        loader = Docx2txtLoader(doc)
    elif file_extension == ".csv":
        loader = CSVLoader(file_path=doc)
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(documents)
# Rebuild the embedding client from the module-level key rather than repeating
# the secret as a second string literal in the source.
embedding = OpenAIEmbeddings(openai_api_key=key_openai)
def upload_chroma(book_file, filename, email):
    """Extract a file's content and persist it in the user's Chroma store.

    Args:
        book_file: The file to ingest; forwarded to ``get_text``.
        filename: Name used by ``get_text`` to pick the extraction strategy.
        email: Identifies the user; its string form names the store directory
            under ``base_path``.

    Returns:
        None. The store is written to disk and progress is reported via tqdm.
    """
    # NOTE(review): `email` is joined into a filesystem path as-is; if it can
    # contain path separators it should be validated by the caller - confirm.
    title = str(email)
    persist_directory = os.path.join(base_path, title)

    # The context manager closes the progress bar even if a step raises.
    with tqdm(total=100) as pbar:
        final_texts = get_text(book_file, filename)
        # get_text returns one plain string for PDFs, but from_documents needs
        # Document objects; chunk the string so PDF uploads actually work.
        if isinstance(final_texts, str):
            final_texts = get_text_chunks_langchain(final_texts)
        pbar.update(40)

        db = Chroma.from_documents(final_texts, embedding, persist_directory=persist_directory)
        pbar.update(40)

        db.persist()
        logging.info(f"Successfully uploaded the PDF of the book: {title}")
        print(f"Successfully uploaded the PDF of the book: {title}")
        pbar.update(20)