""" |
Creator: Lewis Kamau Kimaru
Function: Chat with PDF documents in different languages.
""" |
from langchain.text_splitter import CharacterTextSplitter |
from langchain.embeddings import HuggingFaceBgeEmbeddings |
from langchain.vectorstores import FAISS |
from langchain.chat_models import ChatOpenAI |
from langchain.memory import ConversationBufferMemory |
from langchain.chains import ConversationalRetrievalChain |
from langchain.llms import HuggingFaceHub |
from typing import Union |
from dotenv import load_dotenv |
from PyPDF2 import PdfReader |
import streamlit as st |
import requests |
import json |
import os |
# Expose the Hugging Face token from Streamlit secrets through the
# environment variable that the LangChain Hugging Face integrations read.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = st.secrets["huggingface_token"]

# Browser tab title and icon for the app.
st.set_page_config(
    page_title='SemaNaPDF',
    page_icon='📚',
)

# Base URL of the remote translation service used by translate().
Public_Url = "https://lewiskimaru-helloworld.hf.space/"
def translate(userinput, target_lang, source_lang=None):
    """Translate *userinput* into *target_lang* using the remote translation API.

    When *source_lang* is given, the ``translate_enter`` endpoint is called
    with the explicit source language. Otherwise ``translate_detect`` is
    called and the service reports the language it detected.

    Args:
        userinput: Text to translate.
        target_lang: Language code to translate into (e.g. ``'eng_Latn'``).
        source_lang: Optional language code of *userinput*. If falsy, the
            service is asked to detect the language.

    Returns:
        A ``(source_language, translated_text)`` tuple. ``source_language``
        is *source_lang* when it was supplied, otherwise the
        ``'source_language'`` value returned by the service.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        KeyError: If the JSON reply lacks an expected field.
    """
    payload = {"userinput": userinput}
    if source_lang:
        endpoint = "translate_enter"
        payload["source_lang"] = source_lang
    else:
        endpoint = "translate_detect"
    payload["target_lang"] = target_lang

    # Public_Url may end with a slash; strip it so the path never has '//'.
    url = f"{Public_Url.rstrip('/')}/{endpoint}/"

    # A finite timeout keeps the app from hanging forever on a stalled
    # service; raise_for_status surfaces HTTP errors instead of letting them
    # show up later as a confusing KeyError / JSON decode error.
    response = requests.post(url, json=payload, timeout=120)
    response.raise_for_status()
    result = response.json()

    detected_lang = source_lang if source_lang else result['source_language']
    return detected_lang, result['translated_text']
def get_pdf_text(pdf : Union[str, bytes, bytearray]) -> str:
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf: The PDF source handed straight to ``PdfReader``.

    Returns:
        The text of all pages joined in page order with no separator. Pages
        for which ``extract_text()`` yields nothing are skipped, so a PDF
        with no extractable text (or no pages) produces ``""``.
    """
    reader = PdfReader(pdf)
    page_texts = (page.extract_text() for page in reader.pages)
    # Return the accumulated text of ALL pages (the previous code returned
    # only the last page's text, or None when that page was empty).
    return "".join(text for text in page_texts if text)
def get_text_chunks(text:str) ->list:
    """Split *text* into overlapping chunks.

    The text is split on newlines into chunks of up to 1500 characters
    (measured with ``len``) with a 300-character overlap between
    neighbouring chunks.

    Args:
        text: The full document text.

    Returns:
        The list of chunk strings produced by ``CharacterTextSplitter``.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1500,
        chunk_overlap=300,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks : list) -> FAISS:
    """Embed *text_chunks* and index them in a FAISS vector store.

    Embeddings come from the multilingual MiniLM sentence-transformers model,
    run on CPU with normalised output vectors.

    Args:
        text_chunks: Chunk strings to embed and index.

    Returns:
        A FAISS store built from the chunks.
    """
    # NOTE(review): the BGE wrapper is used with a non-BGE model; it may add a
    # BGE-specific query instruction at search time -- confirm this is intended.
    embedder = HuggingFaceBgeEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
def get_conversation_chain(vectorstore:FAISS) -> ConversationalRetrievalChain:
    """Build a conversational retrieval chain over *vectorstore*.

    The chain combines a Hugging Face Hub hosted Mixtral instruct model, the
    store's default retriever, and a buffer memory that keeps the running
    dialogue under the ``chat_history`` key as message objects.

    Args:
        vectorstore: FAISS store holding the document chunks.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    # NOTE(review): max_length is 1048 (not 1024) -- confirm this is intended.
    hub_llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.5, "max_length": 1048},
    )
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
# Inject CSS that centres the content of Streamlit's spinner widget.
# The selector must be `div.stSpinner > div { ... }`; the previous text
# (`div.stSpinner> {` ... `}div`) was not a valid CSS rule.
st.markdown("""
<style>
div.stSpinner > div {
    text-align:center;
    align-items: center;
    justify-content: center;
}
</style>""", unsafe_allow_html=True)
def main():
    """Render the SemaNaPDF page: upload a PDF, then chat with it.

    Flow:
      1. The uploaded PDF is converted to text, chunked, embedded and wrapped
         in a conversational retrieval chain kept in ``st.session_state``.
      2. Earlier chat messages stored in ``st.session_state.messages`` are
         replayed.
      3. A new question is translated to English (``'eng_Latn'``), sent to
         the chain, and the answer is translated back into the language the
         question was written in.
    """
    st.title("SemaNaPDF📚")

    pdf = st.file_uploader("Upload a PDF Document", type="pdf")
    if pdf is not None:
        # Streamlit re-runs this script on every interaction. Only rebuild the
        # index when a different file is uploaded; otherwise the chain (and
        # its conversation memory) would be recreated before every question.
        pdf_key = (pdf.name, pdf.size)
        if st.session_state.get("processed_pdf") != pdf_key:
            with st.spinner("processing"):
                raw_text = get_pdf_text(pdf)
                # An empty chunk list cannot be indexed, so stop early.
                text_chunks = get_text_chunks(raw_text) if raw_text else []
                if text_chunks:
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)
                    st.session_state.processed_pdf = pdf_key
                else:
                    # Do not keep answering from a previously indexed document.
                    for stale_key in ("conversation", "processed_pdf"):
                        if stale_key in st.session_state:
                            del st.session_state[stale_key]
        if "conversation" in st.session_state:
            st.info("done")
        else:
            st.error("No extractable text was found in this PDF.")

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if user_question := st.chat_input("Ask your document anything ......?"):
        # Without a processed document there is no chain to query.
        if "conversation" not in st.session_state:
            st.warning("Please upload a PDF document before asking a question.")
            return

        with st.chat_message("user"):
            st.markdown(user_question)
        st.session_state.messages.append({"role": "user", "content": user_question})

        # Detect the user's language and get the English form of the question.
        user_lang, english_question = translate(user_question, 'eng_Latn')

        # Query with the English question: the answer is translated back
        # below on the assumption that it is English.
        response = st.session_state.conversation({"question": english_question})
        st.session_state.chat_history = response["chat_history"]

        output = translate(response['answer'], user_lang, 'eng_Latn')[1]
        with st.chat_message("assistant"):
            st.markdown(output)
        # Store what was actually shown, so replayed history matches it.
        st.session_state.messages.append({"role": "assistant", "content": output})
# Copyright footer pinned to the bottom-right corner of the page.
_footer_html = """
<div style="position: fixed; bottom: 0; right: 0; padding: 10px;">
<a href="https://kamaukimaru.vercel.app" target="_blank" style="font-size: 12px; color: #269129; text-decoration: none;">©2023 Lewis Kimaru. All rights reserved.</a>
</div>
"""
st.markdown(_footer_html, unsafe_allow_html=True)
if __name__ == '__main__': |
main() |