File size: 2,684 Bytes
cdbb5c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Install dependencies before running (shell command — not valid Python, so it is
# commented out here; run it in your terminal or prefix with `!` in a notebook):
#   pip install -qU langchain-community faiss-cpu faiss-gpu langchain-openai sentence_transformers gradio

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import os
import pandas as pd
from uuid import uuid4
from langchain_core.documents import Document
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain import PromptTemplate
import gradio as gr

# Load the cleaned news dataset; skip rows that fail CSV parsing rather than abort.
df = pd.read_csv('news_paper-Cleaned.csv', encoding='utf-8', on_bad_lines='skip')

# SECURITY FIX: the original hardcoded a live OpenAI API key in source control.
# Never commit secrets — require the key to be supplied via the environment instead.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your environment before running, "
        "e.g. `export OPENAI_API_KEY=sk-...` (and revoke any key previously committed)."
    )

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Probe the embedding model once to learn its output dimensionality, then build
# a flat (exact, L2-distance) FAISS index of that size.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# Assemble an initially-empty FAISS vector store: documents live in an in-memory
# docstore, and the index→docstore-id map starts empty until documents are added.
docstore = InMemoryDocstore()
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={},
)

# Flatten each dataframe row into a plain dict of the four fields we care about.
documents = []
for _, row in df.iterrows():
    documents.append({
        'title': row['title'],
        'author': row['author'],
        'description': row['description'],
        'full_text': row['full_text'],
    })

# Wrap each record as a LangChain Document. page_content is the dict's string
# repr (so all four fields end up searchable in one blob), tagged with a source.
full_text = []
for doc in documents:
    full_text.append(
        Document(page_content=str(doc), metadata={"source": "news"})
    )

# Split each article into ~1000-character chunks with 100 characters of overlap,
# measuring length in raw characters (len) and treating separators literally.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
text_split = text_splitter.split_documents(full_text)

# Assign one random, stable id per chunk so entries can be referenced later.
uuids = [str(uuid4()) for _ in text_split]

# Embed and insert every chunk into the FAISS store.
vector_store.add_documents(documents=text_split, ids=uuids)

# Expose the store as a retriever using Maximal Marginal Relevance, returning
# the top 10 diverse matches per query.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})

def questions(query):
    """Answer *query* against the indexed news corpus via RetrievalQA.

    Builds a grounded prompt, wires the module-level ``retriever`` into a
    RetrievalQA chain backed by the OpenAI completion model, and returns the
    chain's text result.

    Args:
        query: The user's natural-language question.

    Returns:
        The LLM's answer string (the chain's ``'result'`` field).
    """
    # FIX: the original prompt contained garbled instructions that were sent
    # verbatim to the LLM ("that that", "You have answer only", "spilt between
    # them"). Cleaned up so the model receives coherent directions.
    template = """
    You are a helpful assistant that can answer questions about specific data.
    Answer only from this Context.
    You will receive 10 answers; return all of them, split by new lines.

    Question: {question}
    Context: {context}
    Answer: 
    """

    PROMPT = PromptTemplate(template=template, input_variables=['question', 'context'])

    # NOTE(review): the chain is rebuilt on every call; it could be hoisted to
    # module level, but is kept here to preserve the original structure.
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT},
    )

    return qa_chain({"query": query})['result']


# Minimal Gradio UI: a single text box in, the model's answer text out.
# The variable must be named `demo` for `gradio` CLI auto-reload to find it.
demo = gr.Interface(fn=questions, inputs="text", outputs="text")
demo.launch()