from typing import List
from haystack.dataclasses import ChatMessage
from pypdf import PdfReader
from haystack.utils import Secret
from haystack import Pipeline, Document, component
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import DynamicChatPromptBuilder
from haystack.components.generators.chat import OpenAIChatGenerator, HuggingFaceTGIChatGenerator
from haystack.document_stores.types import DuplicatePolicy
SENTENCE_RETRIEVER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TOKENS = 500
template = """
As a professional HR recruiter given the following information, answer the question shortly and concisely in 1 or 2 sentences.
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

@component
class UploadedFileConverter:
    """
    A component to convert uploaded PDF files to Documents
    """

    @component.output_types(documents=List[Document])
    def run(self, uploaded_file):
        pdf = PdfReader(uploaded_file)
        documents = []
        # Uploaded file name without its extension; each page's Document is
        # named "<file stem>_<page number>" so chunks stay traceable to a page.
        name = uploaded_file.name.rsplit('.', 1)[0]
        for page in pdf.pages:
            documents.append(
                Document(
                    content=page.extract_text(),
                    meta={'name': f"{name}_{page.page_number}"}))
        return {"documents": documents}

def create_ingestion_pipeline(document_store):
    """Build the pipeline that converts, cleans, splits, embeds, and stores uploaded PDFs."""
    doc_embedder = SentenceTransformersDocumentEmbedder(model=SENTENCE_RETRIEVER_MODEL)
    doc_embedder.warm_up()

    pipeline = Pipeline()
    pipeline.add_component("converter", UploadedFileConverter())
    pipeline.add_component("cleaner", DocumentCleaner())
    pipeline.add_component("splitter",
                           DocumentSplitter(split_by="passage", split_length=100, split_overlap=10))
    pipeline.add_component("embedder", doc_embedder)
    pipeline.add_component("writer",
                           DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    return pipeline
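
# Minimal usage sketch (illustrative only: assumes `uploaded_file` is a
# file-like object with a `.name` attribute, e.g. a Streamlit UploadedFile or
# an open()-ed file handle):
#
#   store = InMemoryDocumentStore()
#   ingestion = create_ingestion_pipeline(store)
#   ingestion.run({"converter": {"uploaded_file": uploaded_file}})
#   print(store.count_documents())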

def create_inference_pipeline(document_store, model_name, api_key):
    """Build the RAG query pipeline: embed the question, retrieve chunks, prompt the LLM."""
    if model_name == "local LLM":
        # Points at an OpenAI-compatible local server (e.g. LM Studio's default port).
        generator = OpenAIChatGenerator(api_key=Secret.from_token("<local LLM doesn't need an API key>"),
                                        model=model_name,
                                        api_base_url="http://localhost:1234/v1",
                                        generation_kwargs={"max_tokens": MAX_TOKENS}
                                        )
    elif "gpt" in model_name:
        # Streaming is controlled via `streaming_callback` in Haystack, so it is
        # not passed through generation_kwargs.
        generator = OpenAIChatGenerator(api_key=Secret.from_token(api_key), model=model_name,
                                        generation_kwargs={"max_tokens": MAX_TOKENS}
                                        )
    else:
        generator = HuggingFaceTGIChatGenerator(token=Secret.from_token(api_key), model=model_name,
                                                generation_kwargs={"max_new_tokens": MAX_TOKENS}
                                                )
    pipeline = Pipeline()
    pipeline.add_component("text_embedder",
                           SentenceTransformersTextEmbedder(model=SENTENCE_RETRIEVER_MODEL))
    pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=3))
    pipeline.add_component("prompt_builder",
                           DynamicChatPromptBuilder(runtime_variables=["query", "documents"]))
    pipeline.add_component("llm", generator)
    pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    pipeline.connect("retriever.documents", "prompt_builder.documents")
    pipeline.connect("prompt_builder.prompt", "llm.messages")
    return pipeline
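
# Query-time data flow: text_embedder embeds the question, the retriever fetches
# the top-3 most similar chunks, prompt_builder interpolates them into the chat
# messages, and the generator ("llm") produces the reply.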

class DocumentQAEngine:
    def __init__(self,
                 model_name,
                 api_key=None
                 ):
        self.api_key = api_key
        self.model_name = model_name
        document_store = InMemoryDocumentStore()
        self.chunks = []
        self.inference_pipeline = create_inference_pipeline(document_store, model_name, api_key)
        self.pdf_ingestion_pipeline = create_ingestion_pipeline(document_store)

    def ingest_pdf(self, uploaded_file):
        self.pdf_ingestion_pipeline.run({"converter": {"uploaded_file": uploaded_file}})
    def inference(self, query, input_messages: List[dict]):
        system_message = ChatMessage.from_system(
            "You are a professional HR recruiter that answers questions about the content of the uploaded CV in 1 or 2 sentences.")
        messages = [system_message]
        # Replay the chat history, mapping each entry to the matching role.
        for message in input_messages:
            if message["role"] == "user":
                messages.append(ChatMessage.from_user(message["content"]))
            else:
                messages.append(ChatMessage.from_assistant(message["content"]))
        # The final user message is a Jinja template; DynamicChatPromptBuilder
        # fills in `documents` (from the retriever) and `query` at run time.
        messages.append(ChatMessage.from_user("""
        Relevant information from the uploaded CV:
        {% for doc in documents %}
            {{ doc.content }}
        {% endfor %}

        Question: {{query}}
        Answer:
        """))
        res = self.inference_pipeline.run(data={"text_embedder": {"text": query},
                                                "prompt_builder": {"prompt_source": messages,
                                                                   "query": query
                                                                   }})
        return res["llm"]["replies"][0].content