Spaces:
Runtime error
Runtime error
Joan Giner
committed on
Commit
·
f87e387
1
Parent(s):
460caa6
upgraded openai and langchain versions
Browse files
- app.py +6 -4
- requirements.txt +18 -7
- src/extractor.py +22 -21
app.py
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
import openai
|
2 |
import gradio as gr
|
3 |
-
from
|
4 |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
5 |
from langchain.vectorstores.faiss import FAISS
|
6 |
from langchain.chains.question_answering import load_qa_chain
|
7 |
from langchain.chains import LLMChain
|
8 |
-
from
|
9 |
-
from langchain import PromptTemplate
|
10 |
from langchain.docstore.document import Document
|
11 |
import pandas as pd
|
12 |
import os
|
@@ -24,7 +24,7 @@ load_dotenv()
|
|
24 |
#openai.api_key=os.getenv("OPEN_AI_API_KEY")
|
25 |
#LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
|
26 |
extractor = Extractor()
|
27 |
-
|
28 |
# Define function to handle the Gradio interface
|
29 |
async def extraction(input_file, apikey, dimension):
|
30 |
# Build the chains
|
@@ -55,6 +55,8 @@ async def ui_extraction(input_file, apikey, dimension):
|
|
55 |
raise gr.Error("Please upload a data paper")
|
56 |
if (input_file.name.split(".")[-1] != "pdf"):
|
57 |
raise gr.Error("This is not a data paper!, please upload it in .pdf format")
|
|
|
|
|
58 |
file_name = input_file.name.split("/")[-1]
|
59 |
results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
|
60 |
# Build results in the correct format for the Gradio front-end
|
|
|
1 |
import openai
|
2 |
import gradio as gr
|
3 |
+
from langchain_openai import OpenAIEmbeddings
|
4 |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
5 |
from langchain.vectorstores.faiss import FAISS
|
6 |
from langchain.chains.question_answering import load_qa_chain
|
7 |
from langchain.chains import LLMChain
|
8 |
+
from langchain_community.llms import OpenAI
|
9 |
+
from langchain.prompts import PromptTemplate
|
10 |
from langchain.docstore.document import Document
|
11 |
import pandas as pd
|
12 |
import os
|
|
|
24 |
#openai.api_key=os.getenv("OPEN_AI_API_KEY")
|
25 |
#LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
|
26 |
extractor = Extractor()
|
27 |
+
print(os.getenv("OPEN_AI_API_KEY"))
|
28 |
# Define function to handle the Gradio interface
|
29 |
async def extraction(input_file, apikey, dimension):
|
30 |
# Build the chains
|
|
|
55 |
raise gr.Error("Please upload a data paper")
|
56 |
if (input_file.name.split(".")[-1] != "pdf"):
|
57 |
raise gr.Error("This is not a data paper!, please upload it in .pdf format")
|
58 |
+
if (len(apikey) == 0):
|
59 |
+
raise gr.Error("Please inform your OpenAI Apikey")
|
60 |
file_name = input_file.name.split("/")[-1]
|
61 |
results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
|
62 |
# Build results in the correct format for the Gradio front-end
|
requirements.txt
CHANGED
@@ -33,14 +33,22 @@ gradio==3.32.0
|
|
33 |
gradio_client==0.2.5
|
34 |
h11==0.14.0
|
35 |
httpcore==0.17.2
|
|
|
36 |
httpx==0.24.1
|
37 |
huggingface-hub==0.14.1
|
38 |
idna==3.4
|
|
|
39 |
Jinja2==3.1.2
|
|
|
|
|
40 |
jsonschema==4.17.3
|
41 |
kiwisolver==1.4.4
|
42 |
-
langchain==0.
|
|
|
|
|
|
|
43 |
langcodes==3.3.0
|
|
|
44 |
linkify-it-py==2.0.2
|
45 |
lxml==4.9.2
|
46 |
markdown-it-py==2.2.0
|
@@ -60,10 +68,10 @@ necessary==0.4.2
|
|
60 |
networkx==3.1
|
61 |
numexpr==2.8.4
|
62 |
numpy==1.24.3
|
63 |
-
openai==
|
64 |
openapi-schema-pydantic==1.2.4
|
65 |
orjson==3.8.14
|
66 |
-
packaging==23.
|
67 |
pandas==1.5.3
|
68 |
pathy==0.10.1
|
69 |
pdf2image==1.16.3
|
@@ -86,7 +94,7 @@ PyYAML==6.0
|
|
86 |
regex==2023.5.5
|
87 |
requests==2.31.0
|
88 |
requirements-parser==0.5.0
|
89 |
-
scipdf
|
90 |
semantic-version==2.10.0
|
91 |
six==1.16.0
|
92 |
smart-open==6.3.0
|
@@ -103,7 +111,7 @@ tabula-py==2.7.0
|
|
103 |
tenacity==8.2.2
|
104 |
textstat==0.7.3
|
105 |
thinc==8.1.10
|
106 |
-
tiktoken==0.
|
107 |
tokenizers==0.13.3
|
108 |
toolz==0.12.0
|
109 |
torch==2.0.1
|
@@ -112,11 +120,14 @@ transformers==4.29.2
|
|
112 |
typer==0.7.0
|
113 |
types-setuptools==67.8.0.0
|
114 |
typing-inspect==0.9.0
|
115 |
-
typing_extensions==4.
|
116 |
uc-micro-py==1.0.2
|
117 |
-
urllib3==
|
118 |
uvicorn==0.22.0
|
|
|
119 |
Wand==0.6.11
|
120 |
wasabi==1.1.1
|
|
|
121 |
websockets==11.0.3
|
122 |
yarl==1.9.2
|
|
|
|
33 |
gradio_client==0.2.5
|
34 |
h11==0.14.0
|
35 |
httpcore==0.17.2
|
36 |
+
httptools==0.5.0
|
37 |
httpx==0.24.1
|
38 |
huggingface-hub==0.14.1
|
39 |
idna==3.4
|
40 |
+
importlib-resources==6.1.1
|
41 |
Jinja2==3.1.2
|
42 |
+
jsonpatch==1.33
|
43 |
+
jsonpointer==2.4
|
44 |
jsonschema==4.17.3
|
45 |
kiwisolver==1.4.4
|
46 |
+
langchain==0.1.2
|
47 |
+
langchain-community==0.0.14
|
48 |
+
langchain-core==0.1.14
|
49 |
+
langchain-openai==0.0.3
|
50 |
langcodes==3.3.0
|
51 |
+
langsmith==0.0.83
|
52 |
linkify-it-py==2.0.2
|
53 |
lxml==4.9.2
|
54 |
markdown-it-py==2.2.0
|
|
|
68 |
networkx==3.1
|
69 |
numexpr==2.8.4
|
70 |
numpy==1.24.3
|
71 |
+
openai==1.9.0
|
72 |
openapi-schema-pydantic==1.2.4
|
73 |
orjson==3.8.14
|
74 |
+
packaging==23.2
|
75 |
pandas==1.5.3
|
76 |
pathy==0.10.1
|
77 |
pdf2image==1.16.3
|
|
|
94 |
regex==2023.5.5
|
95 |
requests==2.31.0
|
96 |
requirements-parser==0.5.0
|
97 |
+
scipdf==0.1.dev0
|
98 |
semantic-version==2.10.0
|
99 |
six==1.16.0
|
100 |
smart-open==6.3.0
|
|
|
111 |
tenacity==8.2.2
|
112 |
textstat==0.7.3
|
113 |
thinc==8.1.10
|
114 |
+
tiktoken==0.5.2
|
115 |
tokenizers==0.13.3
|
116 |
toolz==0.12.0
|
117 |
torch==2.0.1
|
|
|
120 |
typer==0.7.0
|
121 |
types-setuptools==67.8.0.0
|
122 |
typing-inspect==0.9.0
|
123 |
+
typing_extensions==4.9.0
|
124 |
uc-micro-py==1.0.2
|
125 |
+
urllib3==1.26.6
|
126 |
uvicorn==0.22.0
|
127 |
+
uvloop==0.17.0
|
128 |
Wand==0.6.11
|
129 |
wasabi==1.1.1
|
130 |
+
watchfiles==0.19.0
|
131 |
websockets==11.0.3
|
132 |
yarl==1.9.2
|
133 |
+
zipp==3.17.0
|
src/extractor.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
import openai
|
2 |
import gradio as gr
|
3 |
-
from langchain.embeddings import OpenAIEmbeddings
|
|
|
4 |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
5 |
from langchain.vectorstores.faiss import FAISS
|
6 |
from langchain.chains.question_answering import load_qa_chain
|
7 |
from langchain.chains import LLMChain
|
8 |
-
from
|
9 |
-
from langchain import PromptTemplate
|
|
|
10 |
from langchain.docstore.document import Document
|
11 |
import pandas as pd
|
12 |
import os
|
@@ -65,11 +67,9 @@ class Extractor:
|
|
65 |
|
66 |
# Extract text from PDF file using SciPDF and the GROBID service (a reachable GROBID server is required)
|
67 |
def extract_text_from_pdf(self, file_path):
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
except:
|
72 |
-
raise gr.Error("Error parsing PDF, please update your data paper in the correct format")
|
73 |
finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
|
74 |
for section in article_dict['sections']:
|
75 |
sec = section['heading'] + ": "
|
@@ -95,7 +95,7 @@ class Extractor:
|
|
95 |
#table_texts.append(query + " "+ result['text'])
|
96 |
table_texts = await asyncio.gather(*table_texts)
|
97 |
for table in table_texts:
|
98 |
-
docsearch.add_texts(table
|
99 |
return docsearch
|
100 |
|
101 |
def extract_text_clean(self, file_name, file_path):
|
@@ -111,9 +111,8 @@ class Extractor:
|
|
111 |
async def prepare_data(self, file_name, file_path, chain_table, apikey):
|
112 |
# Process text and get the embeddings
|
113 |
vectorspath = "./vectors/"+file_name
|
114 |
-
|
115 |
#apikey = openai.api_key
|
116 |
-
raise gr.Error("Please set your api key")
|
117 |
embeddings = OpenAIEmbeddings(openai_api_key=apikey)
|
118 |
if os.path.isfile(vectorspath+"/index.faiss"):
|
119 |
|
@@ -145,17 +144,19 @@ class Extractor:
|
|
145 |
|
146 |
# Save the index locally
|
147 |
FAISS.save_local(docsearch, "./vectors/"+file_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
return docsearch
|
150 |
|
151 |
def build_chains(self, apikey):
|
152 |
-
|
153 |
-
|
154 |
-
raise gr.Error("Please set your Api key")
|
155 |
-
try:
|
156 |
-
LLMClient = OpenAI(model_name='text-davinci-003',openai_api_key=apikey,temperature=0)
|
157 |
-
except:
|
158 |
-
raise gr.Error("Your Api key is not valid")
|
159 |
## In-context prompt
|
160 |
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
161 |
Question: {question}
|
@@ -192,14 +193,14 @@ class Extractor:
|
|
192 |
|
193 |
async def async_table_generate(self, docs,table,chain):
|
194 |
|
195 |
-
resp = await chain.
|
196 |
#resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
|
197 |
return resp
|
198 |
|
199 |
async def async_generate(self, dimension, docs,question,chain):
|
200 |
-
resp = await chain.
|
201 |
#resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
|
202 |
-
return [dimension, resp]
|
203 |
|
204 |
async def get_gathering_dimension(self, docsearch, incontext_prompt, retrieved_docs):
|
205 |
dimensions = [
|
|
|
1 |
import openai
|
2 |
import gradio as gr
|
3 |
+
#from langchain.embeddings import OpenAIEmbeddings
|
4 |
+
from langchain_openai import OpenAIEmbeddings
|
5 |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
6 |
from langchain.vectorstores.faiss import FAISS
|
7 |
from langchain.chains.question_answering import load_qa_chain
|
8 |
from langchain.chains import LLMChain
|
9 |
+
from langchain_community.llms import OpenAI
|
10 |
+
#from langchain import PromptTemplate
|
11 |
+
from langchain.prompts import PromptTemplate
|
12 |
from langchain.docstore.document import Document
|
13 |
import pandas as pd
|
14 |
import os
|
|
|
67 |
|
68 |
# Extract text from PDF file using SciPDF and the GROBID service (a reachable GROBID server is required)
|
69 |
def extract_text_from_pdf(self, file_path):
|
70 |
+
|
71 |
+
article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
|
72 |
+
print("PDF parsed")
|
|
|
|
|
73 |
finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
|
74 |
for section in article_dict['sections']:
|
75 |
sec = section['heading'] + ": "
|
|
|
95 |
#table_texts.append(query + " "+ result['text'])
|
96 |
table_texts = await asyncio.gather(*table_texts)
|
97 |
for table in table_texts:
|
98 |
+
docsearch.add_texts(table)
|
99 |
return docsearch
|
100 |
|
101 |
def extract_text_clean(self, file_name, file_path):
|
|
|
111 |
async def prepare_data(self, file_name, file_path, chain_table, apikey):
|
112 |
# Process text and get the embeddings
|
113 |
vectorspath = "./vectors/"+file_name
|
114 |
+
|
115 |
#apikey = openai.api_key
|
|
|
116 |
embeddings = OpenAIEmbeddings(openai_api_key=apikey)
|
117 |
if os.path.isfile(vectorspath+"/index.faiss"):
|
118 |
|
|
|
144 |
|
145 |
# Save the index locally
|
146 |
FAISS.save_local(docsearch, "./vectors/"+file_name)
|
147 |
+
|
148 |
+
try:
|
149 |
+
result = docsearch.similarity_search("trial query")
|
150 |
+
except Exception as e:
|
151 |
+
print(e)
|
152 |
+
raise gr.Error("Your OpenAI Apikey is not valid")
|
153 |
+
|
154 |
|
155 |
return docsearch
|
156 |
|
157 |
def build_chains(self, apikey):
|
158 |
+
LLMClient = OpenAI(model_name='gpt-3.5-turbo-instruct',openai_api_key=apikey,temperature=0)
|
159 |
+
|
|
|
|
|
|
|
|
|
|
|
160 |
## In-context prompt
|
161 |
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
162 |
Question: {question}
|
|
|
193 |
|
194 |
async def async_table_generate(self, docs,table,chain):
|
195 |
|
196 |
+
resp = await chain.ainvoke({"context": docs, "table": table})
|
197 |
#resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
|
198 |
return resp
|
199 |
|
200 |
async def async_generate(self, dimension, docs,question,chain):
|
201 |
+
resp = await chain.ainvoke({"input_documents": docs, "question": question})
|
202 |
#resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
|
203 |
+
return [dimension, resp['output_text']]
|
204 |
|
205 |
async def get_gathering_dimension(self, docsearch, incontext_prompt, retrieved_docs):
|
206 |
dimensions = [
|