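# Streamlit app: question answering over a PDF (RAG) using Pinecone for retrieval
# and a local GGUF model (gemma-2b-orpo) served through llama-cpp-python.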
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
import streamlit as st
from streamlit_chat import message
# loader = OnlinePDFLoader("food.pdf")
loader = PyPDFLoader("Ramesh_kumar_Resume.pdf")
data = loader.load()

# split the PDF into small chunks so several of them fit in the model prompt
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)
docs = text_splitter.split_documents(data)
# len(docs)  # number of chunks produced
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"] | |
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', st.secrets["PINECONE"]) | |
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter') | |
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY | |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# initialize pinecone
pinecone.Pinecone(
    api_key=PINECONE_API_KEY,      # find at app.pinecone.io
    environment=PINECONE_API_ENV,  # next to api key in console
)
index_name = "testindex"  # put in the name of your pinecone index here
docsearch = Pinecone.from_texts([t.page_content for t in docs], embeddings, index_name=index_name)
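# Note: from_texts embeds every chunk with the model above and upserts the vectors
# into `index_name`; the index must already exist and its dimension must match the
# embedding model (384 for all-MiniLM-L6-v2).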
query="what languages ramesh know?" | |
docs=docsearch.similarity_search(query,k=1) | |
# docs | |
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

# download the quantized weights from the Hugging Face Hub
model_name_or_path = "anakin87/gemma-2b-orpo-GGUF"
filename = "gemma-2b-orpo.Q5_K_M.gguf"  # the model is in GGUF format
model_path = hf_hub_download(repo_id=model_name_or_path, filename=filename)

n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 256  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
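# NOTE: n_gpu_layers / n_batch only take effect if llama-cpp-python was built with
# GPU support (see the commented pip command above); they stay commented out in the
# LlamaCpp call below, so the model runs on CPU.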
query = st.text_input("Ask questions:")
if query:
    # retrieve the chunks most similar to the question
    search = docsearch.similarity_search(query)

    # load the model; streamed tokens are printed by the callback handler
    llm = LlamaCpp(
        model_path=model_path,
        max_tokens=256,
        # n_gpu_layers=n_gpu_layers,
        # n_batch=n_batch,
        callback_manager=callback_manager,
        n_ctx=1024,
        verbose=True,
    )

    # "stuff" places all retrieved chunks into a single prompt for the LLM
    chain = load_qa_chain(llm, chain_type="stuff")
    response = chain.run(input_documents=search, question=query)
    st.write(response)
    # st.session_state['messages'].append({"role": "user", "content": query})
    # st.session_state['messages'].append({"role": "assistant", "content": response})

# response_container = st.container()
# # container for text box
# container = st.container()
# with container:
#     if query:
#         output = response
#         st.session_state['past'].append(query)
#         st.session_state['generated'].append(output)
#     if st.session_state['generated']:
#         with response_container:
#             for i in range(len(st.session_state['generated'])):
#                 message(st.session_state["past"][i], is_user=True, key=str(i) + '_user')
#                 message(st.session_state["generated"][i], key=str(i))