gamingflexer committed on
Commit
5718d42
·
1 Parent(s): 2f67f06

Add unique authors dataframe to the plagiarism checker

Browse files
Files changed (1) hide show
  1. src/app.py +16 -15
src/app.py CHANGED
@@ -3,15 +3,10 @@ import pandas as pd
3
  import logging
4
  from scrapper.main import ArxivPaper
5
  from config import *
6
- from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data
7
  from utils import compare_paper_ids
8
 
9
- """
10
- author_obj = ArxivPaper("Andrew Ng")
11
- paper_links = author_obj.get_results_google(number_of_results=25)
12
- paper_ids = author_obj.get_paper_id(paper_links)
13
- author_obj.get_paper_details_batch(paper_ids=paper_ids, path="./data/papers")
14
- """
15
 
16
  def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.Progress()):
17
  number_of_results_fetch = int(number_of_results_fetch)
@@ -21,26 +16,29 @@ def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.P
21
  db_author_name = get_correct_author_name(authors_name_fetch)
22
  paper_links = author_obj.get_results_google(number_of_results=number_of_results_fetch)
23
  paper_ids = author_obj.get_paper_id(paper_links)
24
- progress(0.4, desc="Collecting Papers")
25
  if db_author_name is None:
 
 
26
  print("No similar author found in the database")
27
- author_obj.get_paper_details_batch(paper_ids=paper_ids, path="./data/papers")
28
  local_saved_papers = os.path.join(os.getcwd(), "data", "papers", authors_name_fetch.replace(" ", "_"))
29
  progress(0.6, desc="Making summary")
30
  data_to_save = []
31
- for paper in os.listdir(local_saved_papers):
32
- paper_path = os.path.join(local_saved_papers, paper)
33
- with open(paper_path, "r") as f:
34
- data_to_save.append(f.read())
 
 
35
  else:
36
  print(f"Found similar author in the database: {db_author_name}")
37
  data = fetch_papers_data(db_author_name)
38
  remaining_paper_ids = compare_paper_ids(data,paper_ids)
 
39
  progress(0.6, desc="Making summary")
40
  data_to_save = []
41
  if remaining_paper_ids != []:
42
  author_obj.get_paper_details_batch(paper_ids=remaining_paper_ids, path="./data/papers")
43
- local_saved_papers = os.path.join(os.getcwd(), "data", "papers", authors_name_fetch.replace(" ", "_"))
44
  for paper in os.listdir(local_saved_papers):
45
  paper_path = os.path.join(local_saved_papers, paper)
46
  with open(paper_path, "r") as f:
@@ -50,7 +48,7 @@ def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.P
50
 
51
  progress(0.8, desc="Saving to Database")
52
  insert_papers_data(data_to_save, authors_name_fetch)
53
- return "Fetched Latest Papers"
54
 
55
  def fetch_papers_data_df(authors_name: str, progress=gr.Progress()):
56
  progress(0.2, desc="Fetching Papers")
@@ -70,6 +68,9 @@ with gr.Blocks() as demo:
70
  dataframe_output = gr.Dataframe(headers=['doi_no', 'author_name', 'title', 'authors', 'year', 'pdf_link',
71
  'references', 'categories', 'comment', 'journal_ref', 'source',
72
  'summary', 'published'])
 
 
 
73
 
74
  with gr.Tab("Arxiv Plagiarism Fetcher & Save to DB"):
75
  with gr.Row():
 
3
  import logging
4
  from scrapper.main import ArxivPaper
5
  from config import *
6
+ from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data, get_unquine_authors
7
  from utils import compare_paper_ids
8
 
9
+ unique_authors_df = get_unquine_authors()
 
 
 
 
 
10
 
11
  def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.Progress()):
12
  number_of_results_fetch = int(number_of_results_fetch)
 
16
  db_author_name = get_correct_author_name(authors_name_fetch)
17
  paper_links = author_obj.get_results_google(number_of_results=number_of_results_fetch)
18
  paper_ids = author_obj.get_paper_id(paper_links)
19
+ progress(0.4, desc=f"Collecting Papers for {len(paper_ids)}")
20
  if db_author_name is None:
21
+ data = fetch_papers_data(db_author_name)
22
+ remaining_paper_ids = compare_paper_ids(data,paper_ids)
23
  print("No similar author found in the database")
 
24
  local_saved_papers = os.path.join(os.getcwd(), "data", "papers", authors_name_fetch.replace(" ", "_"))
25
  progress(0.6, desc="Making summary")
26
  data_to_save = []
27
+ if remaining_paper_ids != []:
28
+ author_obj.get_paper_details_batch(paper_ids=paper_ids, path="./data/papers")
29
+ for paper in os.listdir(local_saved_papers):
30
+ paper_path = os.path.join(local_saved_papers, paper)
31
+ with open(paper_path, "r") as f:
32
+ data_to_save.append(f.read())
33
  else:
34
  print(f"Found similar author in the database: {db_author_name}")
35
  data = fetch_papers_data(db_author_name)
36
  remaining_paper_ids = compare_paper_ids(data,paper_ids)
37
+ local_saved_papers = os.path.join(os.getcwd(), "data", "papers", authors_name_fetch.replace(" ", "_"))
38
  progress(0.6, desc="Making summary")
39
  data_to_save = []
40
  if remaining_paper_ids != []:
41
  author_obj.get_paper_details_batch(paper_ids=remaining_paper_ids, path="./data/papers")
 
42
  for paper in os.listdir(local_saved_papers):
43
  paper_path = os.path.join(local_saved_papers, paper)
44
  with open(paper_path, "r") as f:
 
48
 
49
  progress(0.8, desc="Saving to Database")
50
  insert_papers_data(data_to_save, authors_name_fetch)
51
+ return f"Fetched Latest Papers for {len(remaining_paper_ids)}"
52
 
53
  def fetch_papers_data_df(authors_name: str, progress=gr.Progress()):
54
  progress(0.2, desc="Fetching Papers")
 
68
  dataframe_output = gr.Dataframe(headers=['doi_no', 'author_name', 'title', 'authors', 'year', 'pdf_link',
69
  'references', 'categories', 'comment', 'journal_ref', 'source',
70
  'summary', 'published'])
71
+ with gr.Row():
72
+ unquine_authors_output = gr.Dataframe(headers=["author_name"],value=unique_authors_df, label=" Authors Currently in our DB")
73
+
74
 
75
  with gr.Tab("Arxiv Plagiarism Fetcher & Save to DB"):
76
  with gr.Row():