File size: 6,210 Bytes
2846658
35c8ded
 
2846658
 
 
 
 
 
 
 
 
35c8ded
2846658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35c8ded
2846658
59b0827
 
35c8ded
 
 
2846658
35c8ded
d513adc
 
35c8ded
2846658
35c8ded
 
 
 
 
 
 
 
 
 
 
 
 
2846658
35c8ded
2846658
 
 
 
 
 
 
 
 
 
 
35c8ded
2846658
 
 
 
 
35c8ded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from typing import List

from haystack.dataclasses import ChatMessage
from pypdf import PdfReader
from haystack.utils import Secret
from haystack import Pipeline, Document, component

from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import DynamicChatPromptBuilder
from haystack.components.generators.chat import OpenAIChatGenerator, HuggingFaceTGIChatGenerator
from haystack.document_stores.types import DuplicatePolicy

# Sentence-transformers model name passed to both the document embedder (ingestion)
# and the query text embedder (inference) in the pipeline factories below.
# NOTE(review): the identifier misspells "RETRIEVER"; left unchanged because other
# code in this file references this exact name.
SENTENCE_RETREIVER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Upper bound on generated tokens; forwarded to the chat generators through their
# generation_kwargs ("max_tokens" or "max_new_tokens", depending on the generator).
MAX_TOKENS = 500

# Prompt template with `documents` and `question` placeholders (looks like Jinja syntax).
# NOTE(review): nothing in the visible part of this file references `template`;
# confirm whether it is still used elsewhere before removing it.
template = """
As a professional HR recruiter given the following information, answer the question shortly and concisely in 1 or 2 sentences.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""


@component
class UploadedFileConverter:
    """
    A component to convert uploaded PDF files to Documents.

    Every page of the PDF becomes one Document whose content is the text
    extracted from that page and whose ``meta['name']`` is the uploaded file's
    name without its ``.pdf`` extension, followed by ``_`` and the value of
    pypdf's ``page.page_number``.
    """

    @staticmethod
    def _strip_pdf_extension(file_name: str) -> str:
        """Return ``file_name`` without a trailing ``.pdf`` extension (any letter case)."""
        extension = '.pdf'
        # str.rstrip() removes a *set of characters*, not a suffix, so it would
        # leave "cv.pdf" untouched and mangle names such as "APP.PDF"; compare
        # and slice the suffix explicitly instead.
        if file_name.lower().endswith(extension):
            return file_name[:-len(extension)]
        return file_name

    @component.output_types(documents=List[Document])
    def run(self, uploaded_file):
        """
        Convert an uploaded PDF into one Document per page.

        :param uploaded_file: file-like object accepted by ``pypdf.PdfReader``
            that also exposes a ``name`` attribute holding the file name.
        :return: ``{"documents": [...]}`` with one Document per PDF page.
        """
        pdf = PdfReader(uploaded_file)
        # uploaded file name without .pdf at the end; "_<page number>" is appended per page
        base_name = self._strip_pdf_extension(uploaded_file.name)
        documents = [
            Document(
                content=page.extract_text(),
                # single "_" separator (the previous code produced "__" by appending it twice)
                meta={'name': f"{base_name}_{page.page_number}"})
            for page in pdf.pages
        ]
        return {"documents": documents}


def create_ingestion_pipeline(document_store):
    """
    Assemble the PDF ingestion pipeline.

    The stages are chained linearly in this order:
    converter -> cleaner -> splitter -> embedder -> writer.

    :param document_store: store handed to the DocumentWriter; documents are
        written with ``DuplicatePolicy.OVERWRITE``.
    :return: the connected Pipeline (not yet run).
    """
    # The embedder is created and warmed up before the pipeline is assembled.
    embedder = SentenceTransformersDocumentEmbedder(model=SENTENCE_RETREIVER_MODEL)
    embedder.warm_up()

    # Ordered (name, component) pairs; each stage feeds the one after it.
    stages = [
        ("converter", UploadedFileConverter()),
        ("cleaner", DocumentCleaner()),
        ("splitter", DocumentSplitter(split_by="passage", split_length=100, split_overlap=10)),
        ("embedder", embedder),
        ("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)),
    ]

    ingestion = Pipeline()
    for stage_name, stage in stages:
        ingestion.add_component(stage_name, stage)

    # Connect every stage to its successor.
    stage_names = [stage_name for stage_name, _ in stages]
    for upstream, downstream in zip(stage_names, stage_names[1:]):
        ingestion.connect(upstream, downstream)
    return ingestion


def create_inference_pipeline(document_store, model_name, api_key):
    """
    Assemble the retrieval-augmented question-answering pipeline.

    Flow: text_embedder -> retriever (top 3 documents) -> prompt_builder -> llm.

    The chat generator is selected from ``model_name``:
      * ``"local LLM"``  -> OpenAIChatGenerator pointed at http://localhost:1234/v1
        with a placeholder key;
      * contains "gpt"  -> OpenAIChatGenerator, streaming chunks to stdout;
      * anything else   -> HuggingFaceTGIChatGenerator.

    :param document_store: store queried by the InMemoryEmbeddingRetriever.
    :param model_name: model identifier; also decides which generator is built.
    :param api_key: API key / token for the hosted generators. May be ``None``
        or empty, in which case no explicit secret is passed and the generator
        falls back to its own default credential lookup.
        NOTE(review): per the Haystack docs those defaults read the
        OPENAI_API_KEY / HF_API_TOKEN environment variables -- confirm for the
        installed Haystack version.
    :return: the connected Pipeline (not yet run).
    """
    if model_name == "local LLM":
        generator = OpenAIChatGenerator(api_key=Secret.from_token("<local LLM doesn't need an API key>"),
                                        model=model_name,
                                        api_base_url="http://localhost:1234/v1",
                                        generation_kwargs={"max_tokens": MAX_TOKENS}
                                        )
    elif "gpt" in model_name:
        # Secret.from_token() cannot wrap None / "" and would crash here, so an
        # explicit secret is only passed when a key was actually supplied.
        credentials = {"api_key": Secret.from_token(api_key)} if api_key else {}
        generator = OpenAIChatGenerator(model=model_name,
                                        generation_kwargs={"max_tokens": MAX_TOKENS},
                                        streaming_callback=lambda chunk: print(chunk.content, end="", flush=True),
                                        **credentials
                                        )
    else:
        # Same guard as above for the Hugging Face token.
        credentials = {"token": Secret.from_token(api_key)} if api_key else {}
        generator = HuggingFaceTGIChatGenerator(model=model_name,
                                                generation_kwargs={"max_new_tokens": MAX_TOKENS},
                                                **credentials
                                                )
    pipeline = Pipeline()
    pipeline.add_component("text_embedder",
                           SentenceTransformersTextEmbedder(model=SENTENCE_RETREIVER_MODEL))
    pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=3))
    # "query" and "documents" are supplied at run time and rendered into the chat prompt.
    pipeline.add_component("prompt_builder",
                           DynamicChatPromptBuilder(runtime_variables=["query", "documents"]))
    pipeline.add_component("llm", generator)
    pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    pipeline.connect("retriever.documents", "prompt_builder.documents")
    pipeline.connect("prompt_builder.prompt", "llm.messages")

    return pipeline


class DocumentQAEngine:
    """
    Question-answering engine over uploaded PDF documents.

    Owns one in-memory document store that is shared by an ingestion pipeline
    (PDF -> embedded chunks) and an inference pipeline (query -> retrieved
    chunks -> chat model reply).
    """

    def __init__(self,
                 model_name,
                 api_key=None
                 ):
        """
        :param model_name: model identifier forwarded to ``create_inference_pipeline``.
        :param api_key: optional API key / token forwarded to ``create_inference_pipeline``.
        """
        self.api_key = api_key
        self.model_name = model_name
        # Both pipelines must share the same store so ingested chunks are retrievable.
        document_store = InMemoryDocumentStore()
        # Not used by any method visible in this class; kept as a public attribute.
        self.chunks = []
        self.inference_pipeline = create_inference_pipeline(document_store, model_name, api_key)
        self.pdf_ingestion_pipeline = create_ingestion_pipeline(document_store)

    def ingest_pdf(self, uploaded_file):
        """Run the ingestion pipeline on ``uploaded_file`` so its pages become searchable."""
        self.pdf_ingestion_pipeline.run({"converter": {"uploaded_file": uploaded_file}})

    def inference(self, query, input_messages: List[dict]):
        """
        Answer ``query`` using the ingested documents and the prior conversation.

        :param query: the user's question; it is embedded for retrieval and
            rendered into the final prompt message.
        :param input_messages: earlier chat turns as dicts with ``"role"`` and
            ``"content"`` keys. Turns with role ``"user"`` are replayed as user
            messages; every other role is replayed as an assistant message.
        :return: text content of the first reply produced by the chat model.
        """
        system_message = ChatMessage.from_system(
            "You are a professional HR recruiter that answers questions based on the content of the uploaded CV. in 1 or 2 sentences.")
        messages = [system_message]
        for message in input_messages:
            # Preserve who said what: the previous code replayed user turns as
            # system messages and assistant turns as user messages.
            if message["role"] == "user":
                messages.append(ChatMessage.from_user(message["content"]))
            else:
                messages.append(
                    ChatMessage.from_assistant(message["content"]))
        # Final user turn: a template the prompt builder fills with the
        # retrieved documents and the query.
        messages.append(ChatMessage.from_user("""
        Relevant information from the uploaded CV:
            {% for doc in documents %}
                {{ doc.content }}
            {% endfor %}

            \nQuestion: {{query}}
            \nAnswer:
        """))
        res = self.inference_pipeline.run(data={"text_embedder": {"text": query},
                                                "prompt_builder": {"prompt_source": messages,
                                                                   "query": query
                                                                   }})
        return res["llm"]["replies"][0].content