Commit: first

Files changed:
- Reference_Chat_with_Earnings_Calls_Transcripts.ipynb (+0 -0)
- app.py (+120 -0)
- requirements.txt (+5 -0)
Reference_Chat_with_Earnings_Calls_Transcripts.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
app.py
ADDED
@@ -0,0 +1,120 @@
# for setting/extracting environment variables such as API keys
import os

### 1. For Web Scraping
# for querying the Financial Modeling Prep API
from urllib.request import urlopen
import json

### 2. For Converting Scraped Text Into a Vector Store of Chunked Documents
# for tokenizing texts and splitting them into chunks of documents
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter
# for turning documents into embeddings before putting them in the vector store
from langchain.embeddings import HuggingFaceEmbeddings
# for the vector store of documents
from langchain.vectorstores import Chroma

### 3. For Querying LLM
# for loading Hugging Face LLM models from the hub
from langchain.llms import HuggingFaceHub
# for querying the LLM conveniently using the context
from langchain.chains.question_answering import load_qa_chain

### 4. For Gradio App UI
import gradio as gr

fmp_api_key = os.environ['FMP_API_KEY']


def get_jsonparsed_data(url):
    """Fetch a URL and parse the JSON response."""
    response = urlopen(url)
    data = response.read().decode("utf-8")
    return json.loads(data)


# initialize the tokenizer and splitter used to tokenize and split the texts into chunks later (feel free to try others)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=200, chunk_overlap=20)

# initialize the default model for embedding the tokenized texts; the articles are stored in this embedded form in the vector database
hf_embeddings = HuggingFaceEmbeddings()

# Load the Hugging Face inference endpoint of an LLM model
# Name of the LLM model we are using, feel free to try others!
model = "mistralai/Mistral-7B-Instruct-v0.1"

# This is an inference endpoint API from Hugging Face; the model is not run locally, it runs on Hugging Face
hf_llm = HuggingFaceHub(repo_id=model, model_kwargs={"temperature": 0.5, "max_new_tokens": 300})

# fetch the pre-built Chroma database of chunked, embedded transcripts (removing any stale copy first)
os.system("rm -r chromadb_earnings_transcripts_extracted")
os.system("rm earnings_transcripts_chromadb.zip")
os.system("wget https://github.com/damianboh/test_earnings_calls/raw/main/earnings_transcripts_chromadb.zip")
os.system("unzip earnings_transcripts_chromadb.zip -d chromadb_earnings_transcripts_extracted")

chroma_db = Chroma(persist_directory='chromadb_earnings_transcripts_extracted/chromadb_earnings_transcripts', embedding_function=hf_embeddings)


def source_question_answer(query: str, vectorstore: Chroma = chroma_db, llm: HuggingFaceHub = hf_llm):
    """
    Return the answer to the query, along with the four most relevant
    source chunks and their titles.
    """
    # retrieve the 4 chunks most similar to the query to use as context
    input_docs = vectorstore.similarity_search(query, k=4)
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    # wrap the question in Mistral's instruction format (note the closing [/INST] tag)
    query = f"[INST]According to the earnings calls transcripts earlier, {query}[/INST]"

    response = qa_chain.run(input_documents=input_docs, question=query)
    source_docs_1 = input_docs[0].page_content
    source_docs_2 = input_docs[1].page_content
    source_docs_3 = input_docs[2].page_content
    source_docs_4 = input_docs[3].page_content

    source_title_1 = input_docs[0].metadata['title']
    source_title_2 = input_docs[1].metadata['title']
    source_title_3 = input_docs[2].metadata['title']
    source_title_4 = input_docs[3].metadata['title']

    return response, source_docs_1, source_docs_2, source_docs_3, source_docs_4, source_title_1, source_title_2, source_title_3, source_title_4


with gr.Blocks() as app:

    with gr.Row():
        gr.HTML("<h1>Chat with Tesla 2023 Earnings Calls Transcripts</h1>")

    with gr.Row():
        query = gr.Textbox("Is Elon happy about Tesla?", placeholder="Enter question here...", label="Enter question")
        btn = gr.Button("Ask Question")

    with gr.Row():
        gr.HTML("<h3>Answer</h3>")

    with gr.Row():
        answer = gr.Textbox(label="Answer")

    with gr.Row():
        gr.HTML("<h3>Sources Referenced from Tesla 2023 Earnings Calls Transcripts</h3>")

    with gr.Row():
        with gr.Column():
            source_title_1 = gr.Markdown()
            source1 = gr.Textbox(label="Source Text 1")
        with gr.Column():
            source_title_2 = gr.Markdown()
            source2 = gr.Textbox(label="Source Text 2")

    with gr.Row():
        with gr.Column():
            source_title_3 = gr.Markdown()
            source3 = gr.Textbox(label="Source Text 3")
        with gr.Column():
            source_title_4 = gr.Markdown()
            source4 = gr.Textbox(label="Source Text 4")

    query.submit(fn=source_question_answer, inputs=[query],
                 outputs=[answer, source1, source2, source3, source4, source_title_1, source_title_2, source_title_3, source_title_4])

    btn.click(fn=source_question_answer, inputs=[query],
              outputs=[answer, source1, source2, source3, source4, source_title_1, source_title_2, source_title_3, source_title_4])

app.launch(share=True, debug=True)
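Note that app.py downloads a pre-built Chroma database rather than constructing one, so the FMP helper, tokenizer and text splitter defined above go unused in this file; the scraping and indexing happen in the accompanying notebook. For reference, below is a minimal sketch of how those same objects could assemble such a store. The FMP endpoint path, its response fields ("content", "quarter", "year") and the build_transcript_store helper are assumptions for illustration, not code from this repo:

# Hypothetical sketch: building the persisted Chroma store that app.py downloads.
# The endpoint path and response fields below are assumptions about the
# Financial Modeling Prep v3 API, not taken from this repository.
from langchain.docstore.document import Document

def build_transcript_store(symbol="TSLA", year=2023, quarters=(1, 2, 3)):
    docs = []
    for q in quarters:
        url = (f"https://financialmodelingprep.com/api/v3/earning_call_transcript/"
               f"{symbol}?year={year}&quarter={q}&apikey={fmp_api_key}")
        for t in get_jsonparsed_data(url):  # assumed to return a list of transcript dicts
            title = f"{symbol} {t['year']} Q{t['quarter']} Earnings Call"
            # split each transcript into ~200-token chunks, carrying the title in
            # metadata so source_question_answer can surface it next to each chunk
            for chunk in text_splitter.split_text(t["content"]):
                docs.append(Document(page_content=chunk, metadata={"title": title}))
    # embed the chunks and persist them in the directory layout app.py expects
    return Chroma.from_documents(docs, hf_embeddings,
                                 persist_directory="chromadb_earnings_transcripts_extracted/chromadb_earnings_transcripts")

Persisting into the same directory that app.py reads from would let the wget/unzip step be skipped on subsequent runs.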
requirements.txt
ADDED
@@ -0,0 +1,5 @@
gradio==3.45.2
chromadb==0.4.13
langchain==0.0.305
transformers==4.33.3
sentence-transformers==2.2.2
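These pins match the langchain 0.0.305-era import paths used in app.py. With them installed and FMP_API_KEY set as a Space secret, the QA function can be smoke-tested before the Gradio UI is involved, for example by running the following in place of app.launch(...) at the bottom of the file; the question string is only an example:

# hypothetical smoke test: call the QA function directly and inspect its outputs
answer, s1, s2, s3, s4, t1, t2, t3, t4 = source_question_answer(
    "What did Tesla say about margins?"  # example question
)
print(t1)        # title of the top-matching transcript chunk
print(s1[:200])  # preview of that chunk's text
print(answer)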