# llamaresume/app.py
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
import streamlit as st
from streamlit_chat import message
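# Streamlit resume Q&A app: load the resume PDF, split it into chunks,
# embed the chunks with a sentence-transformers model, index them in Pinecone,
# and answer user questions with a local GGUF model served through llama.cpp.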
# loader = OnlinePDFLoader("food.pdf")
loader = PyPDFLoader("Ramesh_kumar_Resume.pdf")
data = loader.load()
# data
# Split the resume into small chunks so each one fits comfortably in the LLM context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)
docs = text_splitter.split_documents(data)
# len(docs)  # notebook leftover: number of chunks
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', st.secrets["PINECONE"])
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
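# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the Pinecone
# index used below must have been created with dimension=384.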
# Initialize the Pinecone client. The LangChain Pinecone wrapper used below
# picks up PINECONE_API_KEY from the environment (set above).
pc = pinecone.Pinecone(
    api_key=PINECONE_API_KEY,     # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to the API key in the console
)
index_name = "testindex"  # name of your Pinecone index
docsearch = Pinecone.from_texts([t.page_content for t in docs], embeddings, index_name=index_name)
query="what languages ramesh know?"
docs=docsearch.similarity_search(query,k=1)
# docs
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager
model_name_or_path = "anakin87/gemma-2b-orpo-GGUF"
filename = "gemma-2b-orpo.Q5_K_M.gguf" # the model is in bin format
model_path = hf_hub_download(repo_id=model_name_or_path, filename=filename)
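# hf_hub_download fetches the file from the Hugging Face Hub (caching it locally)
# and returns the path to the downloaded GGUF file.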
n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool.
n_batch = 256 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
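# Note: n_gpu_layers and n_batch only take effect if llama-cpp-python was built
# with GPU support (see the CMAKE_ARGS install command above); they are currently
# commented out in the LlamaCpp call below, so inference runs on CPU.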
# Load the model and answer the user's question.
query = st.text_input("Ask questions:")
if query:
    # Retrieve the chunks most relevant to the question from Pinecone.
    search = docsearch.similarity_search(query)
    llm = LlamaCpp(
        model_path=model_path,
        max_tokens=256,
        # n_gpu_layers=n_gpu_layers,
        # n_batch=n_batch,
        callback_manager=callback_manager,
        n_ctx=1024,
        verbose=True,
    )
    # "stuff" chain: concatenate the retrieved chunks into a single prompt for the model.
    chain = load_qa_chain(llm, chain_type="stuff")
    response = chain.run(input_documents=search, question=query)
    st.write(response)
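# Possible optimization (a sketch, not part of the original app): the GGUF weights
# are re-loaded on every Streamlit rerun above. Wrapping the loader in
# st.cache_resource would keep one LlamaCpp instance alive across reruns, e.g.:
#
#   @st.cache_resource
#   def load_llm(path: str) -> LlamaCpp:
#       # Same parameters as the LlamaCpp call above; adjust as needed.
#       return LlamaCpp(model_path=path, max_tokens=256, n_ctx=1024, verbose=True)
#
#   llm = load_llm(model_path)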
# st.session_state['messages'].append({"role": "user", "content": query})
# st.session_state['messages'].append({"role": "assistant", "content": response})
# response_container = st.container()
# # container for text box
# container = st.container()
# with container:
# if query:
# output = response
# st.session_state['past'].append(query)
# st.session_state['generated'].append(output)
# if st.session_state['generated']:
# with response_container:
# for i in range(len(st.session_state['generated'])):
# message(st.session_state["past"][i], is_user=True, key=str(i) + '_user')
# message(st.session_state["generated"][i], key=str(i))