from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
import pinecone
import os
import streamlit as st
from streamlit_chat import message

# Load the source PDF and split it into small chunks for retrieval.
# loader = OnlinePDFLoader("food.pdf")
loader = PyPDFLoader("Ramesh_kumar_Resume.pdf")
data = loader.load()
# data

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)
docs = text_splitter.split_documents(data)
# len(docs)

# Credentials come from environment variables or Streamlit secrets.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', st.secrets["PINECONE"])
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')  # only needed by older pinecone clients
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

# Embedding model used to index and query the chunks.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Initialize the Pinecone client (API key is at app.pinecone.io).
# The v3 client no longer takes an `environment` argument; LangChain also
# picks up PINECONE_API_KEY from the environment variable set above.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
index_name = "testindex"  # put in the name of your Pinecone index here

# Embed the chunks and upsert them into the index.
docsearch = Pinecone.from_texts([t.page_content for t in docs], embeddings, index_name=index_name)

# Quick sanity-check query against the index.
query = "What languages does Ramesh know?"
docs = docsearch.similarity_search(query, k=1)
# docs

# To build llama-cpp-python with CUDA support:
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

# Callbacks support token-wise streaming.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager.

# Download the quantized model from the Hugging Face Hub.
model_name_or_path = "anakin87/gemma-2b-orpo-GGUF"
filename = "gemma-2b-orpo.Q5_K_M.gguf"  # the model is in GGUF format
model_path = hf_hub_download(repo_id=model_name_or_path, filename=filename)

n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 256  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Streamlit UI: take a question, retrieve relevant chunks, and answer with the local LLM.
query = st.text_input("Ask questions:")
if query:
    search = docsearch.similarity_search(query)
    # Note: the model is rebuilt on every Streamlit rerun; see the cached
    # loader sketch at the end of the file for one way to avoid this.
    llm = LlamaCpp(
        model_path=model_path,
        max_tokens=256,
        # n_gpu_layers=n_gpu_layers,
        # n_batch=n_batch,
        callback_manager=callback_manager,
        n_ctx=1024,
        verbose=True,
    )
    chain = load_qa_chain(llm, chain_type="stuff")
    response = chain.run(input_documents=search, question=query)
    st.write(response)

#     st.session_state['messages'].append({"role": "user", "content": query})
#     st.session_state['messages'].append({"role": "assistant", "content": response})

# response_container = st.container()
# # container for text box
# container = st.container()

# with container:
#     if query:
#         output = response
#         st.session_state['past'].append(query)
#         st.session_state['generated'].append(output)

#     if st.session_state['generated']:
#         with response_container:
#             for i in range(len(st.session_state['generated'])):
#                 message(st.session_state["past"][i], is_user=True, key=str(i) + '_user')
#                 message(st.session_state["generated"][i], key=str(i))
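
# --- Optional: cache the LlamaCpp model across Streamlit reruns ---
# A minimal sketch, not wired into the app above: wrapping model creation in
# st.cache_resource keeps a single LlamaCpp instance alive for the session
# instead of rebuilding it on every rerun of the script. The helper name
# `load_llm` is illustrative only and not part of the original code; the
# LlamaCpp parameters mirror the ones used above.
@st.cache_resource
def load_llm(path: str) -> LlamaCpp:
    """Build the LlamaCpp model once and reuse it across reruns."""
    return LlamaCpp(
        model_path=path,
        max_tokens=256,
        callback_manager=callback_manager,
        n_ctx=1024,
        verbose=True,
    )

# Usage inside the `if query:` block would then be: llm = load_llm(model_path)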