aliasgerovs committed on
Commit
c45480c
·
2 Parent(s): a32fa53 f14cff1

Merge branch 'minko'

Browse files
Files changed (2) hide show
  1. app.py +41 -26
  2. plagiarism.py +18 -9
app.py CHANGED
@@ -218,11 +218,9 @@ def ai_check(text: str, option: str):
218
 
219
 
220
  def generate_prompt(settings: Dict[str, str]) -> str:
221
- content_string = "\n".join(
222
- f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
223
- )
224
-
225
  prompt = f"""
 
 
226
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
227
 
228
  Style and Tone:
@@ -243,9 +241,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
243
  - End with a {settings['conclusion_type']} conclusion
244
  - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
245
  - Do not make any headline, title bold.
246
-
247
- Use the content here from the URLs I've found for you:
248
- {content_string}
249
 
250
  Ensure proper paragraph breaks for better readability.
251
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -254,11 +250,9 @@ def generate_prompt(settings: Dict[str, str]) -> str:
254
 
255
 
256
  def regenerate_prompt(settings: Dict[str, str]) -> str:
257
- content_string = "\n".join(
258
- f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
259
- )
260
-
261
  prompt = f"""
 
 
262
  "{settings['generated_article']}"
263
 
264
  Edit the given text based on user comments.
@@ -268,8 +262,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
268
  - The original content should not be changed. Make minor modifications based on user comments above.
269
  - Keep the references the same as the given text in the same format.
270
  - Do not make any headline, title bold.
271
- Use the content here from the URLs I've found for you:
272
- {content_string}
273
 
274
  Ensure proper paragraph breaks for better readability.
275
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -278,6 +271,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
278
 
279
 
280
  def generate_article(
 
281
  topic: str,
282
  keywords: str,
283
  article_length: str,
@@ -291,15 +285,13 @@ def generate_article(
291
  num_examples: str,
292
  conclusion_type: str,
293
  ai_model: str,
294
- sorted_date,
295
- domains_to_skip,
296
  api_key: str = None,
297
  generated_article: str = None,
298
  user_comments: str = None,
299
  ) -> str:
300
-
301
- url_content = google_search(topic, sorted_date, domains_to_skip)
302
  settings = {
 
303
  "topic": topic,
304
  "keywords": [k.strip() for k in keywords.split(",")],
305
  "article_length": article_length,
@@ -312,7 +304,7 @@ def generate_article(
312
  "references": [r.strip() for r in references.split(",")],
313
  "num_examples": num_examples,
314
  "conclusion_type": conclusion_type,
315
- "sources": url_content,
316
  "generated_article": generated_article,
317
  "user_comments": user_comments,
318
  }
@@ -378,7 +370,11 @@ def format_references(text: str) -> str:
378
  in_references = False
379
 
380
  for line in lines:
381
- if line.strip().lower() == "references" or line.strip().lower() == "references:":
 
 
 
 
382
  in_references = True
383
  continue
384
  if in_references:
@@ -395,6 +391,7 @@ def format_references(text: str) -> str:
395
 
396
 
397
  def generate_and_format(
 
398
  topic,
399
  keywords,
400
  article_length,
@@ -409,20 +406,29 @@ def generate_and_format(
409
  conclusion_type,
410
  ai_model,
411
  api_key,
 
412
  year_from,
413
  month_from,
414
  day_from,
415
  year_to,
416
  month_to,
417
  day_to,
418
- domains_to_skip,
419
  generated_article: str = None,
420
  user_comments: str = None,
421
  ):
422
  date_from = build_date(year_from, month_from, day_from)
423
  date_to = build_date(year_to, month_to, day_to)
424
  sorted_date = f"date:r:{date_from}:{date_to}"
 
 
 
 
 
 
 
425
  article = generate_article(
 
426
  topic,
427
  keywords,
428
  article_length,
@@ -436,9 +442,8 @@ def generate_and_format(
436
  num_examples,
437
  conclusion_type,
438
  ai_model,
 
439
  api_key,
440
- sorted_date,
441
- domains_to_skip,
442
  generated_article,
443
  user_comments,
444
  )
@@ -464,6 +469,7 @@ def create_interface():
464
  with gr.Column(scale=2):
465
  with gr.Group():
466
  gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
 
467
  input_topic = gr.Textbox(
468
  label="Topic",
469
  placeholder="Enter the main topic of your article",
@@ -584,6 +590,10 @@ def create_interface():
584
  )
585
  gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
586
  with gr.Group():
 
 
 
 
587
  with gr.Row():
588
  month_from = gr.Dropdown(
589
  choices=months,
@@ -605,10 +615,11 @@ def create_interface():
605
  year_to = gr.Textbox(label="To Year", value=d1[2])
606
 
607
  with gr.Row():
608
- domains_to_skip = gr.Dropdown(
609
  domain_list,
 
610
  multiselect=True,
611
- label="Domain To Skip",
612
  )
613
 
614
  with gr.Group():
@@ -690,6 +701,7 @@ def create_interface():
690
  generate_btn.click(
691
  fn=generate_and_format,
692
  inputs=[
 
693
  input_topic,
694
  input_keywords,
695
  input_length,
@@ -704,13 +716,14 @@ def create_interface():
704
  input_conclusion,
705
  ai_generator,
706
  input_api,
 
707
  year_from,
708
  month_from,
709
  day_from,
710
  year_to,
711
  month_to,
712
  day_to,
713
- domains_to_skip,
714
  ],
715
  outputs=[output_article],
716
  )
@@ -718,6 +731,7 @@ def create_interface():
718
  regenerate_btn.click(
719
  fn=generate_and_format,
720
  inputs=[
 
721
  input_topic,
722
  input_keywords,
723
  input_length,
@@ -732,13 +746,14 @@ def create_interface():
732
  input_conclusion,
733
  ai_generator,
734
  input_api,
 
735
  year_from,
736
  month_from,
737
  day_from,
738
  year_to,
739
  month_to,
740
  day_to,
741
- domains_to_skip,
742
  output_article,
743
  ai_comments,
744
  ],
 
218
 
219
 
220
  def generate_prompt(settings: Dict[str, str]) -> str:
 
 
 
 
221
  prompt = f"""
222
+ I am a {settings['role']}
223
+
224
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
225
 
226
  Style and Tone:
 
241
  - End with a {settings['conclusion_type']} conclusion
242
  - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
243
  - Do not make any headline, title bold.
244
+ {settings['sources']}
 
 
245
 
246
  Ensure proper paragraph breaks for better readability.
247
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
250
 
251
 
252
  def regenerate_prompt(settings: Dict[str, str]) -> str:
 
 
 
 
253
  prompt = f"""
254
+ I am a {settings['role']}
255
+
256
  "{settings['generated_article']}"
257
 
258
  Edit the given text based on user comments.
 
262
  - The original content should not be changed. Make minor modifications based on user comments above.
263
  - Keep the references the same as the given text in the same format.
264
  - Do not make any headline, title bold.
265
+ {settings['sources']}
 
266
 
267
  Ensure proper paragraph breaks for better readability.
268
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
271
 
272
 
273
  def generate_article(
274
+ input_role: str,
275
  topic: str,
276
  keywords: str,
277
  article_length: str,
 
285
  num_examples: str,
286
  conclusion_type: str,
287
  ai_model: str,
288
+ content_string: str,
 
289
  api_key: str = None,
290
  generated_article: str = None,
291
  user_comments: str = None,
292
  ) -> str:
 
 
293
  settings = {
294
+ "role": input_role,
295
  "topic": topic,
296
  "keywords": [k.strip() for k in keywords.split(",")],
297
  "article_length": article_length,
 
304
  "references": [r.strip() for r in references.split(",")],
305
  "num_examples": num_examples,
306
  "conclusion_type": conclusion_type,
307
+ "sources": content_string,
308
  "generated_article": generated_article,
309
  "user_comments": user_comments,
310
  }
 
370
  in_references = False
371
 
372
  for line in lines:
373
+ if (
374
+ line.strip().lower() == "references"
375
+ or line.strip().lower() == "references:"
376
+ or line.strip().lower().startswith("references:")
377
+ ):
378
  in_references = True
379
  continue
380
  if in_references:
 
391
 
392
 
393
  def generate_and_format(
394
+ input_role,
395
  topic,
396
  keywords,
397
  article_length,
 
406
  conclusion_type,
407
  ai_model,
408
  api_key,
409
+ google_search_check,
410
  year_from,
411
  month_from,
412
  day_from,
413
  year_to,
414
  month_to,
415
  day_to,
416
+ domains_to_include,
417
  generated_article: str = None,
418
  user_comments: str = None,
419
  ):
420
  date_from = build_date(year_from, month_from, day_from)
421
  date_to = build_date(year_to, month_to, day_to)
422
  sorted_date = f"date:r:{date_from}:{date_to}"
423
+ content_string = ""
424
+ if google_search_check:
425
+ url_content = google_search(topic, sorted_date, domains_to_include)
426
+ content_string = "\n".join(
427
+ f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
428
+ )
429
+ content_string = "Use the trusted information here from the URLs I've found for you:\n" + content_string
430
  article = generate_article(
431
+ input_role,
432
  topic,
433
  keywords,
434
  article_length,
 
442
  num_examples,
443
  conclusion_type,
444
  ai_model,
445
+ content_string,
446
  api_key,
 
 
447
  generated_article,
448
  user_comments,
449
  )
 
469
  with gr.Column(scale=2):
470
  with gr.Group():
471
  gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
472
+ input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
473
  input_topic = gr.Textbox(
474
  label="Topic",
475
  placeholder="Enter the main topic of your article",
 
590
  )
591
  gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
592
  with gr.Group():
593
+ with gr.Row():
594
+ google_search_check = gr.Checkbox(
595
+ label="Enable Google Search For Recent Sources", value=True
596
+ )
597
  with gr.Row():
598
  month_from = gr.Dropdown(
599
  choices=months,
 
615
  year_to = gr.Textbox(label="To Year", value=d1[2])
616
 
617
  with gr.Row():
618
+ domains_to_include = gr.Dropdown(
619
  domain_list,
620
+ value=domain_list,
621
  multiselect=True,
622
+ label="Domains To Include",
623
  )
624
 
625
  with gr.Group():
 
701
  generate_btn.click(
702
  fn=generate_and_format,
703
  inputs=[
704
+ input_role,
705
  input_topic,
706
  input_keywords,
707
  input_length,
 
716
  input_conclusion,
717
  ai_generator,
718
  input_api,
719
+ google_search_check,
720
  year_from,
721
  month_from,
722
  day_from,
723
  year_to,
724
  month_to,
725
  day_to,
726
+ domains_to_include,
727
  ],
728
  outputs=[output_article],
729
  )
 
731
  regenerate_btn.click(
732
  fn=generate_and_format,
733
  inputs=[
734
+ input_role,
735
  input_topic,
736
  input_keywords,
737
  input_length,
 
746
  input_conclusion,
747
  ai_generator,
748
  input_api,
749
+ google_search_check,
750
  year_from,
751
  month_from,
752
  day_from,
753
  year_to,
754
  month_to,
755
  day_to,
756
+ domains_to_include,
757
  output_article,
758
  ai_comments,
759
  ],
plagiarism.py CHANGED
@@ -61,10 +61,18 @@ async def parallel_scrap(urls):
61
  return results
62
 
63
 
 
 
 
 
 
 
 
 
64
  def google_search_urls(
65
  text,
66
  sorted_date,
67
- domains_to_skip,
68
  api_key,
69
  cse_id,
70
  **kwargs,
@@ -75,7 +83,9 @@ def google_search_urls(
75
  if "items" in results and len(results["items"]) > 0:
76
  for count, link in enumerate(results["items"]):
77
  # skip user selected domains
78
- if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
 
 
79
  continue
80
  url = link["link"]
81
  if url not in url_list:
@@ -84,25 +94,24 @@ def google_search_urls(
84
 
85
 
86
  def google_search(
87
- input,
88
  sorted_date,
89
- domains_to_skip,
90
  ):
91
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
92
- api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
93
- # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
94
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
95
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
96
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
97
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
98
  cse_id = "851813e81162b4ed4"
99
-
100
  # get list of URLS to check
101
  start_time = time.perf_counter()
102
  url_list = google_search_urls(
103
- input,
104
  sorted_date,
105
- domains_to_skip,
106
  api_key,
107
  cse_id,
108
  )
 
61
  return results
62
 
63
 
64
+ def scrap(urls):
65
+ client = httpx.Client()
66
+ soups = []
67
+ for url in urls:
68
+ soups.append(get_url_data(url=url, client=client))
69
+ return soups
70
+
71
+
72
  def google_search_urls(
73
  text,
74
  sorted_date,
75
+ domains_to_include,
76
  api_key,
77
  cse_id,
78
  **kwargs,
 
83
  if "items" in results and len(results["items"]) > 0:
84
  for count, link in enumerate(results["items"]):
85
  # skip user selected domains
86
+ if (domains_to_include is None) or not any(
87
+ ("." + domain) in link["link"] for domain in domains_to_include
88
+ ):
89
  continue
90
  url = link["link"]
91
  if url not in url_list:
 
94
 
95
 
96
  def google_search(
97
+ topic,
98
  sorted_date,
99
+ domains_to_include,
100
  ):
101
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
102
+ # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
103
+ api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
104
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
105
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
106
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
107
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
108
  cse_id = "851813e81162b4ed4"
 
109
  # get list of URLS to check
110
  start_time = time.perf_counter()
111
  url_list = google_search_urls(
112
+ topic,
113
  sorted_date,
114
+ domains_to_include,
115
  api_key,
116
  cse_id,
117
  )