eljanmahammadli committed on
Commit fa3e7dd · 1 Parent(s): 8c8c07f

#feat added simplest scholar mode

Files changed (2)
  1. app.py +59 -29
  2. google_search.py +3 -5
app.py CHANGED
@@ -21,9 +21,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from google.cloud import storage
 
 if gr.NO_RELOAD:
-    from humanize import humanize_text, device
-    # humanize_text = None
-    # device = None
+    # from humanize import humanize_text, device
+    humanize_text = None
+    device = None
     from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
     from google_search import google_search, months, domain_list, build_date
     from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
@@ -796,6 +796,24 @@ def save_humanizer_feedback_to_cloud_storage(data, humanizer_feedback):
         gr.Warning("Nothing humanized to save yet!")
 
 
+scholar_urls = [
+    "arxiv.org",
+    "aclanthology.org",
+    "ieeexplore.ieee.org",
+    "researchgate.net",
+    # "scholar.google.com",
+    "springer.com",
+    # "sciencedirect.com",  # 400
+    # "onlinelibrary.wiley.com",  # 400
+    "jstor.org",  # 400
+    "semanticscholar.org",
+    "biorxiv.org",
+    "medrxiv.org",
+    "ssrn.com",
+    "pubmed.ncbi.nlm.nih.gov",
+    "cochranelibrary.com",
+]
+
 def generate_and_format(
     input_role,
     topic,
@@ -812,6 +830,7 @@ def generate_and_format(
     num_examples,
     conclusion_type,
     google_search_check,
+    scholar_mode_check,
     year_from,
     month_from,
     day_from,
@@ -838,14 +857,18 @@ def generate_and_format(
         final_query = llm_wrapper(
             input_role, topic, context, model="OpenAI GPT 4o", task_type="internet", temperature=0.7
         )
-        if include_sites:
-            site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
-            final_query += " " + " OR ".join(site_queries)
-        if exclude_sites:
-            exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
-            final_query += " " + " ".join(exclude_queries)
+        if scholar_mode_check:
+            scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
+            final_query += " " + " OR ".join(scholar_site_queries)
+        else:
+            if include_sites:
+                site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
+                final_query += " " + " OR ".join(site_queries)
+            if exclude_sites:
+                exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
+                final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
-        url_content = google_search(final_query, sorted_date, domains_to_include)
+        url_content = google_search(final_query, sorted_date, domains_to_include, scholar_mode_check)
 
         yt_content = {}
         if yt_url:
@@ -1077,24 +1100,27 @@ with gr.Blocks(
            )
        with gr.Group(visible=google_default) as search_options:
            with gr.Row():
-                include_sites = gr.Textbox(
-                    label="Include Specific Websites",
-                    placeholder="Enter comma-separated keywords",
-                    elem_classes="input-highlight-yellow",
-                )
-            with gr.Row():
-                exclude_sites = gr.Textbox(
-                    label="Exclude Specific Websites",
-                    placeholder="Enter comma-separated keywords",
-                    elem_classes="input-highlight-yellow",
-                )
-            with gr.Row():
-                domains_to_include = gr.Dropdown(
-                    domain_list,
-                    value=domain_list,
-                    multiselect=True,
-                    label="Domains To Include",
-                )
+                scholar_mode_check = gr.Checkbox(label="Enable Scholar Mode", value=False)
+            with gr.Group(visible=True) as site_options:
+                with gr.Row():
+                    include_sites = gr.Textbox(
+                        label="Include Specific Websites",
+                        placeholder="Enter comma-separated keywords",
+                        elem_classes="input-highlight-yellow",
+                    )
+                with gr.Row():
+                    exclude_sites = gr.Textbox(
+                        label="Exclude Specific Websites",
+                        placeholder="Enter comma-separated keywords",
+                        elem_classes="input-highlight-yellow",
+                    )
+                with gr.Row():
+                    domains_to_include = gr.Dropdown(
+                        domain_list,
+                        value=domain_list,
+                        multiselect=True,
+                        label="Domains To Include",
+                    )
            with gr.Row():
                month_from = gr.Dropdown(
                    choices=months,
@@ -1224,7 +1250,9 @@ with gr.Blocks(
        else:
            return gr.update(visible=False)
 
-    google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
+    google_search_check.change(
+        lambda toggle: gr.update(visible=toggle), inputs=google_search_check, outputs=search_options
+    )
     # ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
     # output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
     # ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
@@ -1257,6 +1285,7 @@ with gr.Blocks(
            # ai_generator,
            # input_api,
            google_search_check,
+            scholar_mode_check,
            year_from,
            month_from,
            day_from,
@@ -1293,6 +1322,7 @@ with gr.Blocks(
            # ai_generator,
            # input_api,
            google_search_check,
+            scholar_mode_check,
            year_from,
            month_from,
            day_from,
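For reference, the scholar-mode branch above boils down to OR-joining Google `site:` filters over the whitelisted scholarly domains and appending them to the LLM-generated query. A minimal standalone sketch of that assembly (the topic string and the trimmed domain list are illustrative stand-ins, not values from app.py):

# Sketch of the scholar-mode query assembly; final_query stands in for llm_wrapper output.
scholar_urls = ["arxiv.org", "aclanthology.org", "pubmed.ncbi.nlm.nih.gov"]  # trimmed for brevity

final_query = "effects of sleep deprivation on working memory"
scholar_mode_check = True

if scholar_mode_check:
    # Restrict results to scholarly domains with OR-joined site: filters, as in the diff.
    scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
    final_query += " " + " OR ".join(scholar_site_queries)

print(final_query)
# effects of sleep deprivation on working memory site:arxiv.org OR site:aclanthology.org OR site:pubmed.ncbi.nlm.nih.gov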
google_search.py CHANGED
@@ -193,14 +193,12 @@ def google_search_urls(
     return url_list
 
 
-def google_search(
-    topic,
-    sorted_date,
-    domains_to_include,
-):
+def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
     cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
     start_time = time.perf_counter()
+    if scholar_mode_check:
+        topic += " -filetype:pdf"
     url_list = google_search_urls(
         topic,
         sorted_date,
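This change threads the same flag into google_search and, when it is set, appends Google's -filetype:pdf operator so direct PDF results are excluded. A self-contained sketch of just that query mutation (the topic value is illustrative; the full function is not reproduced since it also needs GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CSE_ID):

# Sketch of the query tweak applied inside google_search when scholar mode is on.
topic = "effects of sleep deprivation on working memory site:arxiv.org OR site:pubmed.ncbi.nlm.nih.gov"
scholar_mode_check = True

if scholar_mode_check:
    topic += " -filetype:pdf"  # exclude direct PDF hits from the search results

print(topic)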