Spaces:
Build error
Build error
temp dir for each survey
Browse files- app.py +26 -19
- src/Surveyor.py +36 -41
app.py
CHANGED
@@ -9,25 +9,32 @@ from pathlib import Path
|
|
9 |
from src.Surveyor import Surveyor
|
10 |
|
11 |
|
12 |
-
|
13 |
-
def get_surveyor_instance(
|
14 |
with st.spinner('Loading The-Researcher ...'):
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
|
33 |
def show_survey_download(zip_file_name, survey_file_name, download_placeholder):
|
@@ -81,7 +88,7 @@ if __name__ == '__main__':
|
|
81 |
submit = st.form_submit_button(label="Submit")
|
82 |
st.sidebar.write('#### execution log:')
|
83 |
|
84 |
-
run_kwargs = {'
|
85 |
'download_placeholder':download_placeholder}
|
86 |
if submit:
|
87 |
if session_data['research_keywords'] != '':
|
|
|
9 |
from src.Surveyor import Surveyor
|
10 |
|
11 |
|
12 |
+
@st.experimental_singleton(suppress_st_warning=True)
def get_surveyor_instance(_print_fn, _survey_print_fn):
    """Create the ``Surveyor`` used by the app while showing a loading spinner.

    The function is wrapped in ``st.experimental_singleton``; presumably this
    is so the (expensive) Surveyor is built once and reused across reruns --
    see the Streamlit documentation for the exact caching semantics.

    Args:
        _print_fn: callable forwarded to ``Surveyor`` as ``print_fn``.
        _survey_print_fn: callable forwarded to ``Surveyor`` as
            ``survey_print_fn``.

    Returns:
        A ``Surveyor`` constructed with ``high_gpu=True``.
    """
    surveyor_options = {
        'print_fn': _print_fn,
        'survey_print_fn': _survey_print_fn,
        'high_gpu': True,
    }
    with st.spinner('Loading The-Researcher ...'):
        return Surveyor(**surveyor_options)
|
16 |
+
|
17 |
+
|
18 |
+
def run_survey(surveyor, download_placeholder, research_keywords=None, arxiv_ids=None, max_search=None, num_papers=None):
    """Run one survey inside its own freshly created directory tree.

    A per-run root directory is created under the current working directory,
    named after the SHA-1 of the current timestamp, with ``pdf``, ``txt``,
    ``img``, ``tab`` and ``dump`` sub-directories. Those paths are passed to
    ``surveyor.survey`` as ``pdf_dir``, ``txt_dir``, ``img_dir``, ``tab_dir``
    and ``dump_dir``; the resulting files are handed to
    ``show_survey_download``.

    Args:
        surveyor: object exposing ``survey(query, id_list, max_search=...,
            num_papers=..., pdf_dir=..., ...)``.
        download_placeholder: forwarded unchanged to ``show_survey_download``.
        research_keywords: search query, passed as first positional argument.
        arxiv_ids: list of arXiv ids, passed as second positional argument.
        max_search: forwarded to ``surveyor.survey``.
        num_papers: forwarded to ``surveyor.survey``.
    """
    import hashlib
    import os
    import time

    # hashlib only accepts bytes-like input, so the timestamp text has to be
    # encoded before hashing (passing a str raises TypeError on Python 3).
    run_id = hashlib.sha1(str(time.time()).encode('utf-8')).hexdigest()
    survey_root = Path(run_id).resolve()

    dir_paths = {f'{dname}_dir': survey_root / dname
                 for dname in ('pdf', 'txt', 'img', 'tab', 'dump')}
    for d in dir_paths.values():
        d.mkdir(exist_ok=True, parents=True)
    print(survey_root)
    print(dir_paths)

    # Surveyor.survey builds file names by plain string concatenation
    # (e.g. ``survey_dump_dir + 'papers_metadata.dmp'``), so every directory
    # string must end with a path separator, like the built-in defaults do.
    dir_args = {k: os.path.join(str(v.resolve()), '') for k, v in dir_paths.items()}

    zip_file_name, survey_file_name = surveyor.survey(research_keywords,
                                                      arxiv_ids,
                                                      max_search=max_search,
                                                      num_papers=num_papers,
                                                      **dir_args)
    show_survey_download(zip_file_name, survey_file_name, download_placeholder)
|
38 |
|
39 |
|
40 |
def show_survey_download(zip_file_name, survey_file_name, download_placeholder):
|
|
|
88 |
submit = st.form_submit_button(label="Submit")
|
89 |
st.sidebar.write('#### execution log:')
|
90 |
|
91 |
+
run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write),
|
92 |
'download_placeholder':download_placeholder}
|
93 |
if submit:
|
94 |
if session_data['research_keywords'] != '':
|
src/Surveyor.py
CHANGED
@@ -30,11 +30,6 @@ class Surveyor:
|
|
30 |
|
31 |
def __init__(
|
32 |
self,
|
33 |
-
pdf_dir=None,
|
34 |
-
txt_dir=None,
|
35 |
-
img_dir=None,
|
36 |
-
tab_dir=None,
|
37 |
-
dump_dir=None,
|
38 |
models_dir=None,
|
39 |
title_model_name=None,
|
40 |
ex_summ_model_name=None,
|
@@ -53,11 +48,6 @@ class Surveyor:
|
|
53 |
Initializes models and directory structure for the surveyor
|
54 |
|
55 |
Optional Params:
|
56 |
-
- pdf_dir: String, pdf paper storage directory - defaults to arxiv_data/tarpdfs/
|
57 |
-
- txt_dir: String, text-converted paper storage directory - defaults to arxiv_data/fulltext/
|
58 |
-
- img_dir: String, image image storage directory - defaults to arxiv_data/images/
|
59 |
-
- tab_dir: String, tables storage directory - defaults to arxiv_data/tables/
|
60 |
-
- dump_dir: String, all_output_dir - defaults to arxiv_dumps/
|
61 |
- models_dir: String, directory to save to huge models
|
62 |
- title_model_name: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
|
63 |
- ex_summ_model_name: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
|
@@ -192,41 +182,41 @@ class Surveyor:
|
|
192 |
self.similarity_nlp = spacy.load(similarity_nlp_name)
|
193 |
self.kw_model = KeyBERT(kw_model_name)
|
194 |
|
195 |
-
self.define_structure(pdf_dir=pdf_dir, txt_dir=txt_dir, img_dir=img_dir, tab_dir=tab_dir, dump_dir=dump_dir)
|
196 |
|
197 |
def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
|
198 |
|
199 |
if pdf_dir:
|
200 |
-
|
201 |
else:
|
202 |
-
|
203 |
|
204 |
if txt_dir:
|
205 |
-
|
206 |
else:
|
207 |
-
|
208 |
|
209 |
if img_dir:
|
210 |
-
|
211 |
else:
|
212 |
-
|
213 |
|
214 |
if tab_dir:
|
215 |
-
|
216 |
else:
|
217 |
-
|
218 |
|
219 |
if dump_dir:
|
220 |
-
|
221 |
else:
|
222 |
-
|
223 |
|
224 |
-
dirs = [
|
225 |
if sum([True for dir in dirs if 'arxiv_data/' in dir]):
|
226 |
base = os.path.dirname("arxiv_data/")
|
227 |
if not os.path.exists(base):
|
228 |
os.mkdir(base)
|
229 |
self.clean_dirs(dirs)
|
|
|
230 |
|
231 |
def clean_dirs(self, dirs):
|
232 |
import shutil
|
@@ -1345,9 +1335,14 @@ class Surveyor:
|
|
1345 |
zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
|
1346 |
zipdir(dump_dir, zipf)
|
1347 |
|
1348 |
-
def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False
|
|
|
1349 |
import joblib
|
1350 |
import os, shutil
|
|
|
|
|
|
|
|
|
1351 |
if not max_search:
|
1352 |
max_search = self.DEFAULTS['max_search']
|
1353 |
if not num_papers:
|
@@ -1357,39 +1352,39 @@ class Surveyor:
|
|
1357 |
# arxiv api relevance search and data preparation
|
1358 |
self.print_fn("\n- searching arXiv for top 100 papers.. ")
|
1359 |
results, searched_papers = self.search(query, id_list, max_search=max_search)
|
1360 |
-
joblib.dump(searched_papers,
|
1361 |
self.print_fn("\n- found " + str(len(searched_papers)) + " papers")
|
1362 |
|
1363 |
# paper selection by scibert vector embedding relevance scores
|
1364 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
1365 |
|
1366 |
-
papers_highlighted, papers_selected, cites = self.pdf_route(
|
1367 |
searched_papers)
|
1368 |
|
1369 |
if weigh_authors:
|
1370 |
authors = self.author_stats(papers_highlighted)
|
1371 |
|
1372 |
-
joblib.dump(papers_highlighted,
|
1373 |
|
1374 |
self.print_fn("\n- Standardizing known section headings per paper.. ")
|
1375 |
papers_standardized = self.standardize_headings(papers_highlighted)
|
1376 |
-
joblib.dump(papers_standardized,
|
1377 |
|
1378 |
self.print_fn("\n- Building paper-wise corpus.. ")
|
1379 |
corpus = self.build_corpus(papers_highlighted, searched_papers)
|
1380 |
-
joblib.dump(corpus,
|
1381 |
|
1382 |
self.print_fn("\n- Building section-wise corpus.. ")
|
1383 |
corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
|
1384 |
-
joblib.dump(corpus_sectionwise,
|
1385 |
|
1386 |
self.print_fn("\n- Building basic research highlights.. ")
|
1387 |
research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
|
1388 |
-
joblib.dump(research_blocks,
|
1389 |
|
1390 |
self.print_fn("\n- Reducing corpus to lines.. ")
|
1391 |
corpus_lines = self.get_corpus_lines(corpus)
|
1392 |
-
joblib.dump(corpus_lines,
|
1393 |
|
1394 |
# temp
|
1395 |
# searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp')
|
@@ -1423,7 +1418,7 @@ class Surveyor:
|
|
1423 |
|
1424 |
self.print_fn("\n- Building abstract.. ")
|
1425 |
abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
|
1426 |
-
joblib.dump(abstract_block,
|
1427 |
'''
|
1428 |
self.print_fn("abstract_block type:"+ str(type(abstract_block)))
|
1429 |
self.print_fn("abstract_block:")
|
@@ -1432,7 +1427,7 @@ class Surveyor:
|
|
1432 |
|
1433 |
self.print_fn("\n- Building introduction.. ")
|
1434 |
intro_block = self.get_intro(corpus_sectionwise, research_blocks)
|
1435 |
-
joblib.dump(intro_block,
|
1436 |
'''
|
1437 |
self.print_fn("intro_block type:"+ str(type(intro_block)))
|
1438 |
self.print_fn("intro_block:")
|
@@ -1440,8 +1435,8 @@ class Surveyor:
|
|
1440 |
'''
|
1441 |
self.print_fn("\n- Building custom sections.. ")
|
1442 |
clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
|
1443 |
-
joblib.dump(clustered_sections,
|
1444 |
-
joblib.dump(clustered_sentences,
|
1445 |
|
1446 |
'''
|
1447 |
self.print_fn("clusters extracted")
|
@@ -1454,11 +1449,11 @@ class Surveyor:
|
|
1454 |
'''
|
1455 |
clustered_sections['abstract'] = abstract_block
|
1456 |
clustered_sections['introduction'] = intro_block
|
1457 |
-
joblib.dump(clustered_sections,
|
1458 |
|
1459 |
self.print_fn("\n- Building conclusion.. ")
|
1460 |
conclusion_block = self.get_conclusion(clustered_sections)
|
1461 |
-
joblib.dump(conclusion_block,
|
1462 |
clustered_sections['conclusion'] = conclusion_block
|
1463 |
'''
|
1464 |
self.print_fn("conclusion_block type:"+ str(type(conclusion_block)))
|
@@ -1469,18 +1464,18 @@ class Surveyor:
|
|
1469 |
query = self.generate_title(' '.join([v for v in clustered_sections.values()]))
|
1470 |
|
1471 |
survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
|
1472 |
-
survey_file = Path(
|
1473 |
self.build_doc(clustered_sections, papers_standardized, query=query, filename=str(survey_file))
|
1474 |
|
1475 |
self.survey_print_fn("\n-citation-network: ")
|
1476 |
self.survey_print_fn(cites)
|
1477 |
|
1478 |
-
shutil.copytree('arxiv_data/',
|
1479 |
assert (os.path.exists(survey_file))
|
1480 |
|
1481 |
zip_name = 'arxiv_dumps_'+query.replace(' ', '_')+'.zip'
|
1482 |
-
zip_name = Path(
|
1483 |
-
self.zip_outputs(
|
1484 |
self.print_fn("\n- Survey complete.. \nSurvey file path :" + str(survey_file) +
|
1485 |
"\nAll outputs zip path :" + str(zip_name))
|
1486 |
|
|
|
30 |
|
31 |
def __init__(
|
32 |
self,
|
|
|
|
|
|
|
|
|
|
|
33 |
models_dir=None,
|
34 |
title_model_name=None,
|
35 |
ex_summ_model_name=None,
|
|
|
48 |
Initializes models and directory structure for the surveyor
|
49 |
|
50 |
Optional Params:
|
|
|
|
|
|
|
|
|
|
|
51 |
- models_dir: String, directory to save to huge models
|
52 |
- title_model_name: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
|
53 |
- ex_summ_model_name: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
|
|
|
182 |
self.similarity_nlp = spacy.load(similarity_nlp_name)
|
183 |
self.kw_model = KeyBERT(kw_model_name)
|
184 |
|
|
|
185 |
|
186 |
def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
    """Resolve the working directories for one survey run and prepare them.

    Each argument falls back to the matching entry of ``self.DEFAULTS`` when
    it is falsy (``None`` or an empty string).

    Args:
        pdf_dir: pdf paper storage directory.
        txt_dir: text-converted paper storage directory.
        img_dir: image storage directory.
        tab_dir: tables storage directory.
        dump_dir: directory for all outputs.

    Returns:
        ``[pdf_dir, txt_dir, img_dir, tab_dir, dump_dir]`` after defaults have
        been applied, in that order.
    """
    defaults = self.DEFAULTS
    survey_pdf_dir = pdf_dir or defaults["pdf_dir"]
    survey_txt_dir = txt_dir or defaults["txt_dir"]
    survey_img_dir = img_dir or defaults["img_dir"]
    survey_tab_dir = tab_dir or defaults["tab_dir"]
    survey_dump_dir = dump_dir or defaults["dump_dir"]

    dirs = [survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir]

    # When any directory lives under arxiv_data/, make sure that parent
    # exists; exist_ok avoids the check-then-create race of exists()+mkdir().
    if any('arxiv_data/' in path for path in dirs):
        os.makedirs(os.path.dirname("arxiv_data/"), exist_ok=True)

    # NOTE(review): indentation was lost in the diff view; clean_dirs is
    # assumed to run unconditionally (not only in the arxiv_data/ case) --
    # confirm against the repository.
    self.clean_dirs(dirs)
    return dirs
|
220 |
|
221 |
def clean_dirs(self, dirs):
|
222 |
import shutil
|
|
|
1335 |
zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
|
1336 |
zipdir(dump_dir, zipf)
|
1337 |
|
1338 |
+
def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False,
|
1339 |
+
pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
|
1340 |
import joblib
|
1341 |
import os, shutil
|
1342 |
+
|
1343 |
+
dirs = self.define_structure(pdf_dir=pdf_dir, txt_dir=txt_dir, img_dir=img_dir, tab_dir=tab_dir, dump_dir=dump_dir)
|
1344 |
+
[survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir] = dirs
|
1345 |
+
|
1346 |
if not max_search:
|
1347 |
max_search = self.DEFAULTS['max_search']
|
1348 |
if not num_papers:
|
|
|
1352 |
# arxiv api relevance search and data preparation
|
1353 |
self.print_fn("\n- searching arXiv for top 100 papers.. ")
|
1354 |
results, searched_papers = self.search(query, id_list, max_search=max_search)
|
1355 |
+
joblib.dump(searched_papers, survey_dump_dir + 'papers_metadata.dmp')
|
1356 |
self.print_fn("\n- found " + str(len(searched_papers)) + " papers")
|
1357 |
|
1358 |
# paper selection by scibert vector embedding relevance scores
|
1359 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
1360 |
|
1361 |
+
papers_highlighted, papers_selected, cites = self.pdf_route(survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir,
|
1362 |
searched_papers)
|
1363 |
|
1364 |
if weigh_authors:
|
1365 |
authors = self.author_stats(papers_highlighted)
|
1366 |
|
1367 |
+
joblib.dump(papers_highlighted, survey_dump_dir + 'papers_highlighted.dmp')
|
1368 |
|
1369 |
self.print_fn("\n- Standardizing known section headings per paper.. ")
|
1370 |
papers_standardized = self.standardize_headings(papers_highlighted)
|
1371 |
+
joblib.dump(papers_standardized, survey_dump_dir + 'papers_standardized.dmp')
|
1372 |
|
1373 |
self.print_fn("\n- Building paper-wise corpus.. ")
|
1374 |
corpus = self.build_corpus(papers_highlighted, searched_papers)
|
1375 |
+
joblib.dump(corpus, survey_dump_dir + 'corpus.dmp')
|
1376 |
|
1377 |
self.print_fn("\n- Building section-wise corpus.. ")
|
1378 |
corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
|
1379 |
+
joblib.dump(corpus_sectionwise, survey_dump_dir + 'corpus_sectionwise.dmp')
|
1380 |
|
1381 |
self.print_fn("\n- Building basic research highlights.. ")
|
1382 |
research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
|
1383 |
+
joblib.dump(research_blocks, survey_dump_dir + 'research_blocks.dmp')
|
1384 |
|
1385 |
self.print_fn("\n- Reducing corpus to lines.. ")
|
1386 |
corpus_lines = self.get_corpus_lines(corpus)
|
1387 |
+
joblib.dump(corpus_lines, survey_dump_dir + 'corpus_lines.dmp')
|
1388 |
|
1389 |
# temp
|
1390 |
# searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp')
|
|
|
1418 |
|
1419 |
self.print_fn("\n- Building abstract.. ")
|
1420 |
abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
|
1421 |
+
joblib.dump(abstract_block, survey_dump_dir + 'abstract_block.dmp')
|
1422 |
'''
|
1423 |
self.print_fn("abstract_block type:"+ str(type(abstract_block)))
|
1424 |
self.print_fn("abstract_block:")
|
|
|
1427 |
|
1428 |
self.print_fn("\n- Building introduction.. ")
|
1429 |
intro_block = self.get_intro(corpus_sectionwise, research_blocks)
|
1430 |
+
joblib.dump(intro_block, survey_dump_dir + 'intro_block.dmp')
|
1431 |
'''
|
1432 |
self.print_fn("intro_block type:"+ str(type(intro_block)))
|
1433 |
self.print_fn("intro_block:")
|
|
|
1435 |
'''
|
1436 |
self.print_fn("\n- Building custom sections.. ")
|
1437 |
clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
|
1438 |
+
joblib.dump(clustered_sections, survey_dump_dir + 'clustered_sections.dmp')
|
1439 |
+
joblib.dump(clustered_sentences, survey_dump_dir + 'clustered_sentences.dmp')
|
1440 |
|
1441 |
'''
|
1442 |
self.print_fn("clusters extracted")
|
|
|
1449 |
'''
|
1450 |
clustered_sections['abstract'] = abstract_block
|
1451 |
clustered_sections['introduction'] = intro_block
|
1452 |
+
joblib.dump(clustered_sections, survey_dump_dir + 'research_sections.dmp')
|
1453 |
|
1454 |
self.print_fn("\n- Building conclusion.. ")
|
1455 |
conclusion_block = self.get_conclusion(clustered_sections)
|
1456 |
+
joblib.dump(conclusion_block, survey_dump_dir + 'conclusion_block.dmp')
|
1457 |
clustered_sections['conclusion'] = conclusion_block
|
1458 |
'''
|
1459 |
self.print_fn("conclusion_block type:"+ str(type(conclusion_block)))
|
|
|
1464 |
query = self.generate_title(' '.join([v for v in clustered_sections.values()]))
|
1465 |
|
1466 |
survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
|
1467 |
+
survey_file = Path(survey_dump_dir).resolve() / survey_file
|
1468 |
self.build_doc(clustered_sections, papers_standardized, query=query, filename=str(survey_file))
|
1469 |
|
1470 |
self.survey_print_fn("\n-citation-network: ")
|
1471 |
self.survey_print_fn(cites)
|
1472 |
|
1473 |
+
shutil.copytree('arxiv_data/', survey_dump_dir + '/arxiv_data/')
|
1474 |
assert (os.path.exists(survey_file))
|
1475 |
|
1476 |
zip_name = 'arxiv_dumps_'+query.replace(' ', '_')+'.zip'
|
1477 |
+
zip_name = Path(survey_dump_dir).parent.resolve() / zip_name
|
1478 |
+
self.zip_outputs(survey_dump_dir, str(zip_name))
|
1479 |
self.print_fn("\n- Survey complete.. \nSurvey file path :" + str(survey_file) +
|
1480 |
"\nAll outputs zip path :" + str(zip_name))
|
1481 |
|