minko186 committed on
Commit 078999d
1 Parent(s): 24a982b

sync with humanize.py from main

Files changed (1):
  1. humanize.py +102 -851
humanize.py CHANGED
@@ -1,864 +1,115 @@
1
- """
2
- nohup python3 app.py &
3
- """
4
-
5
- import re
6
- import requests
7
- from typing import Dict
8
- from collections import defaultdict
9
- from datetime import date
10
- import gradio as gr
11
- from scipy.special import softmax
12
- import language_tool_python
13
- import nltk
14
  import torch
15
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast
16
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
17
-
18
- from utils import remove_special_characters
19
- from plagiarism import google_search, months, domain_list, build_date
20
- from humanize import paraphrase_text, device
21
- from ai_generate import generate
22
-
23
- print(f"Using device: {device}")
24
-
25
- models = {
26
- "Polygraf AI (Base Model)": AutoModelForSequenceClassification.from_pretrained(
27
- "polygraf-ai/bc-roberta-openai-2sent"
28
- ).to(device),
29
- "Polygraf AI (Advanced Model)": AutoModelForSequenceClassification.from_pretrained(
30
- "polygraf-ai/bc_combined_3sent"
31
- ).to(device),
32
- }
33
- tokenizers = {
34
- "Polygraf AI (Base Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc-roberta-openai-2sent"),
35
- "Polygraf AI (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
36
  }
37
 
38
- # grammar correction tool
39
- tool = language_tool_python.LanguageTool("en-US")
40
-
41
-
42
- # Function to move model to the appropriate device
43
- def to_device(model):
44
- return model.to(device)
45
-
46
-
47
- def copy_to_input(text):
48
- return text
49
-
50
-
51
- def remove_bracketed_numbers(text):
52
- pattern = r"^\[\d+\]"
53
- cleaned_text = re.sub(pattern, "", text)
54
- return cleaned_text
55
-
56
-
57
- def clean_text(text: str) -> str:
58
- paragraphs = text.split("\n\n")
59
- cleaned_paragraphs = []
60
- for paragraph in paragraphs:
61
- cleaned = re.sub(r"\s+", " ", paragraph).strip()
62
- cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
63
- cleaned_paragraphs.append(cleaned)
64
- return "\n".join(cleaned_paragraphs)
65
-
66
-
67
- def split_text_from_refs(text: str, sep="\n"):
68
- lines = text.split("\n")
69
- references = []
70
- article_text = []
71
- index_pattern = re.compile(r"\[(\d+)\]")
72
- in_references = False
73
-
74
- for line in lines:
75
- if line.strip().lower() == "references" or line.strip().lower() == "references:":
76
- in_references = True
77
- continue
78
- if line.strip().lower().startswith("references:"):
79
- in_references = True
80
- if in_references:
81
- matches = index_pattern.split(line)
82
- for match in matches:
83
- if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
84
- references.append(match.strip())
85
- else:
86
- article_text.append(line)
87
-
88
- formatted_refs = []
89
- for i, ref in enumerate(references, 1):
90
- ref = remove_bracketed_numbers(ref)
91
- formatted_refs.append(f"[{i}] {ref}{sep}")
92
-
93
- return "\n\n".join(article_text), f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
94
-
95
-
96
- def ends_with_references(text):
97
- # Define a regular expression pattern for variations of "References:"
98
- pattern = re.compile(r"\b[Rr]eferences:\s*$", re.IGNORECASE | re.MULTILINE)
99
-
100
- # Check if the text ends with any form of "References:"
101
- return bool(pattern.search(text.strip()))
102
-
103
-
104
- def format_and_correct_language_check(text: str) -> str:
105
- return tool.correct(text)
106
-
107
-
108
- def predict(model, tokenizer, text):
109
- text = remove_special_characters(text)
110
- bc_token_size = 256
111
- with torch.no_grad():
112
- model.eval()
113
- tokens = tokenizer(
114
- text,
115
- padding="max_length",
116
- truncation=True,
117
- max_length=bc_token_size,
118
- return_tensors="pt",
119
- ).to(device)
120
- output = model(**tokens)
121
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
122
- output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
123
- return output_norm
124
-
125
-
126
- def ai_generated_test(text, model="BC Original"):
127
- return predict(models[model], tokenizers[model], text)
128
-
129
-
130
- def detection_polygraf(text, model="BC Original"):
131
- # sentences = split_into_sentences(text)
132
- sentences = nltk.sent_tokenize(text)
133
- num_sentences = len(sentences)
134
- scores = defaultdict(list)
135
-
136
- overall_scores = []
137
-
138
- # Process each chunk of 3 sentences and store the score for each sentence in the chunk
139
- for i in range(num_sentences):
140
- chunk = " ".join(sentences[i : i + 3])
141
- if chunk:
142
- # result = classifier(chunk)
143
- result = ai_generated_test(chunk, model)
144
- score = result["AI"]
145
- for j in range(i, min(i + 3, num_sentences)):
146
- scores[j].append(score)
147
-
148
- # Calculate the average score for each sentence and apply color coding
149
- paragraphs = text.split("\n")
150
- paragraphs = [s for s in paragraphs if s.strip()]
151
- colored_paragraphs = []
152
- i = 0
153
- for paragraph in paragraphs:
154
- temp_sentences = nltk.sent_tokenize(paragraph)
155
- colored_sentences = []
156
- for sentence in temp_sentences:
157
- if scores[i]:
158
- avg_score = sum(scores[i]) / len(scores[i])
159
- if avg_score >= 0.65:
160
- colored_sentence = f"<span style='background-color:red;'>{sentence}</span>"
161
- else:
162
- colored_sentence = sentence
163
- colored_sentences.append(colored_sentence)
164
- overall_scores.append(avg_score)
165
- i = i + 1
166
- combined_sentences = " ".join(colored_sentences)
167
- colored_paragraphs.append(combined_sentences)
168
-
169
- overall_score = sum(overall_scores) / len(overall_scores)
170
- overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
171
- return overall_score, "<br><br>".join(colored_paragraphs)
172
-
173
-
174
- ai_check_options = [
175
- "Polygraf AI (Base Model)",
176
- "Polygraf AI (Advanced Model)",
177
- ]
178
-
179
-
180
- def ai_generated_test_sapling(text: str) -> Dict:
181
- response = requests.post(
182
- "https://api.sapling.ai/api/v1/aidetect", json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
183
- )
184
- return {"AI": response.json()["score"], "HUMAN": 1 - response.json()["score"]}
185
-
186
-
187
- class GPT2PPL:
188
- def __init__(self):
189
- self.device = device
190
- self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
191
- self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
192
-
193
- def __call__(self, text):
194
- encodings = self.tokenizer(text, return_tensors="pt")
195
- encodings = {k: v.to(self.device) for k, v in encodings.items()}
196
- max_length = self.model.config.n_positions
197
- stride = 512
198
- seq_len = encodings.input_ids.size(1)
199
-
200
- nlls = []
201
- for i in range(0, seq_len, stride):
202
- begin_loc = max(i + stride - max_length, 0)
203
- end_loc = min(i + stride, seq_len)
204
- trg_len = end_loc - i
205
- input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
206
- target_ids = input_ids.clone()
207
- target_ids[:, :-trg_len] = -100
208
-
209
- with torch.no_grad():
210
- outputs = self.model(input_ids, labels=target_ids)
211
- neg_log_likelihood = outputs.loss * trg_len
212
-
213
- nlls.append(neg_log_likelihood)
214
-
215
- ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
216
- return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
217
-
218
-
219
- def ai_generated_test_gptzero(text):
220
- gptzero_model = GPT2PPL()
221
- result = gptzero_model(text)
222
- return result, None
223
-
224
-
225
- def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
226
- body, references = split_text_from_refs(text, "<br>")
227
- score, text = detection_polygraf(text=body, model=model)
228
- text = text + "<br>" + references
229
- return score, text
230
-
231
-
232
- def ai_check(text: str, option: str):
233
- if option.startswith("Polygraf AI"):
234
- return highlighter_polygraf(text, option)
235
- elif option == "Sapling AI":
236
- return ai_generated_test_sapling(text)
237
- elif option == "GPTZero":
238
- return ai_generated_test_gptzero(text)
239
  else:
240
- return highlighter_polygraf(text, option)
241
-
242
-
243
- def generate_prompt(settings: Dict[str, str]) -> str:
244
- prompt = f"""
245
- I am a {settings['role']}
246
- Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
247
-
248
- Style and Tone:
249
- - Writing style: {settings['writing_style']}
250
- - Tone: {settings['tone']}
251
- - Target audience: {settings['user_category']}
252
-
253
- Content:
254
- - Depth: {settings['depth_of_content']}
255
- - Structure: {', '.join(settings['structure'])}
256
-
257
- Keywords to incorporate:
258
- {', '.join(settings['keywords'])}
259
-
260
- Additional requirements:
261
- - Don't start with "Here is a...", start with the requested text directly
262
- - Include {settings['num_examples']} relevant examples or case studies
263
- - Incorporate data or statistics from {', '.join(settings['references'])}
264
- - End with a {settings['conclusion_type']} conclusion
265
- - Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
266
- - Do not make any headline, title bold.
267
- {settings['sources']}
268
-
269
- Ensure proper paragraph breaks for better readability.
270
- Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
271
- """
272
- return prompt
273
-
274
-
275
- def regenerate_prompt(settings: Dict[str, str]) -> str:
276
- prompt = f"""
277
- I am a {settings['role']}
278
- "{settings['generated_article']}"
279
- Edit the given text based on user comments.
280
-
281
- Comments:
282
- - Don't start with "Here is a...", start with the requested text directly
283
- - {settings['user_comments']}
284
- - The original content should not be changed. Make minor modifications based on user comments above.
285
- - Keep the references the same as the given text in the same format.
286
- - Do not make any headline, title bold.
287
- {settings['sources']}
288
-
289
- Ensure proper paragraph breaks for better readability.
290
- Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
291
- """
292
- return prompt
293
-
294
-
295
- def generate_article(
296
- input_role: str,
297
- topic: str,
298
- keywords: str,
299
- article_length: str,
300
- format: str,
301
- writing_style: str,
302
- tone: str,
303
- user_category: str,
304
- depth_of_content: str,
305
- structure: str,
306
- references: str,
307
- num_examples: str,
308
- conclusion_type: str,
309
- ai_model: str,
310
- content_string: str,
311
- # api_key: str = None,
312
- pdf_file_input=None,
313
- generated_article: str = None,
314
- user_comments: str = None,
315
- ) -> str:
316
- settings = {
317
- "role": input_role,
318
- "topic": topic,
319
- "keywords": [k.strip() for k in keywords.split(",")],
320
- "article_length": article_length,
321
- "format": format,
322
- "writing_style": writing_style,
323
- "tone": tone,
324
- "user_category": user_category,
325
- "depth_of_content": depth_of_content,
326
- "structure": [s.strip() for s in structure.split(",")],
327
- "references": [r.strip() for r in references.split(",")],
328
- "num_examples": num_examples,
329
- "conclusion_type": conclusion_type,
330
- "sources": content_string,
331
- "generated_article": generated_article,
332
- "user_comments": user_comments,
333
- }
334
-
335
- if generated_article:
336
- prompt = regenerate_prompt(settings)
337
- else:
338
- prompt = generate_prompt(settings)
339
-
340
- print("Generated Prompt...\n", prompt)
341
- article = generate(
342
- prompt,
343
- ai_model,
344
- pdf_file_input, # api_key
345
- )
346
-
347
- return clean_text(article)
348
-
349
-
350
- def humanize(
351
- text: str,
352
- model: str,
353
- temperature: float = 1.2,
354
- repetition_penalty: float = 1,
355
- top_k: int = 50,
356
- length_penalty: float = 1,
357
- ) -> str:
358
- body, references = split_text_from_refs(text)
359
- result = paraphrase_text(
360
- text=body,
361
- model_name=model,
362
  temperature=temperature,
363
  repetition_penalty=repetition_penalty,
364
  top_k=top_k,
365
  length_penalty=length_penalty,
366
  )
367
- result = result + "\n\n" + references
368
- return format_and_correct_language_check(result)
369
-
370
-
371
- def update_visibility_api(model: str):
372
- if model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
373
- return gr.update(visible=True)
374
- else:
375
- return gr.update(visible=False)
376
-
377
-
378
- def format_references(text: str) -> str:
379
- lines = text.split("\n")
380
- references = []
381
- article_text = []
382
- index_pattern = re.compile(r"\[(\d+)\]")
383
- in_references = False
384
-
385
- for line in lines:
386
- if line.strip().lower() == "references" or line.strip().lower() == "references:":
387
- in_references = True
388
- continue
389
- if line.strip().lower().startswith("references:"):
390
- in_references = True
391
- if in_references:
392
- matches = index_pattern.split(line)
393
- for match in matches:
394
- if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
395
- references.append(match.strip())
396
- else:
397
- article_text.append(line)
398
-
399
- formatted_refs = []
400
- for i, ref in enumerate(references, 1):
401
- ref = remove_bracketed_numbers(ref)
402
- formatted_refs.append(f"[{i}] {ref}\n")
403
-
404
- return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
405
-
406
-
407
- def generate_and_format(
408
- input_role,
409
- topic,
410
- keywords,
411
- article_length,
412
- format,
413
- writing_style,
414
- tone,
415
- user_category,
416
- depth_of_content,
417
- structure,
418
- references,
419
- num_examples,
420
- conclusion_type,
421
- ai_model,
422
- # api_key,
423
- google_search_check,
424
- year_from,
425
- month_from,
426
- day_from,
427
- year_to,
428
- month_to,
429
- day_to,
430
- domains_to_include,
431
- include_sites,
432
- exclude_sites,
433
- pdf_file_input,
434
- generated_article: str = None,
435
- user_comments: str = None,
436
  ):
437
- content_string = ""
438
- url_content = None
439
- if google_search_check:
440
- date_from = build_date(year_from, month_from, day_from)
441
- date_to = build_date(year_to, month_to, day_to)
442
- sorted_date = f"date:r:{date_from}:{date_to}"
443
- final_query = topic
444
- if include_sites:
445
- site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
446
- final_query += " " + " OR ".join(site_queries)
447
- if exclude_sites:
448
- exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
449
- final_query += " " + " ".join(exclude_queries)
450
- print(f"Google Search Query: {final_query}")
451
- url_content = google_search(final_query, sorted_date, domains_to_include)
452
- content_string = "\n".join(
453
- f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
454
- )
455
- content_string = (
456
- "Use the trusted information here from the URLs and add them as References:\n" + content_string
457
- )
458
- article = generate_article(
459
- input_role,
460
- topic,
461
- keywords,
462
- article_length,
463
- format,
464
- writing_style,
465
- tone,
466
- user_category,
467
- depth_of_content,
468
- structure,
469
- references,
470
- num_examples,
471
- conclusion_type,
472
- ai_model,
473
- content_string,
474
- # api_key,
475
- pdf_file_input,
476
- generated_article,
477
- user_comments,
478
- )
479
- if ends_with_references(article) and url_content is not None:
480
- for url in url_content.keys():
481
- article += f"\n{url}"
482
-
483
- return format_references(article)
484
-
485
-
486
- def create_interface():
487
- with gr.Blocks(
488
- theme=gr.themes.Default(
489
- primary_hue=gr.themes.colors.pink, secondary_hue=gr.themes.colors.yellow, neutral_hue=gr.themes.colors.gray
490
- ),
491
- css="""
492
- .input-highlight-pink block_label {background-color: #008080}
493
- """,
494
- ) as demo:
495
- today = date.today()
496
- # dd/mm/YY
497
- d1 = today.strftime("%d/%B/%Y")
498
- d1 = d1.split("/")
499
- gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
500
-
501
- with gr.Row():
502
- with gr.Column(scale=2):
503
- with gr.Group():
504
- gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
505
- input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
506
- input_topic = gr.Textbox(
507
- label="Topic",
508
- placeholder="Enter the main topic of your article",
509
- elem_classes="input-highlight-pink",
510
- )
511
- input_keywords = gr.Textbox(
512
- label="Keywords",
513
- placeholder="Enter comma-separated keywords",
514
- elem_classes="input-highlight-yellow",
515
- )
516
-
517
- with gr.Row():
518
- input_format = gr.Dropdown(
519
- choices=[
520
- "Article",
521
- "Essay",
522
- "Blog post",
523
- "Report",
524
- "Research paper",
525
- "News article",
526
- "White paper",
527
- "LinkedIn post",
528
- "X (Twitter) post",
529
- "Instagram Video Content",
530
- "TikTok Video Content",
531
- "Facebook post",
532
- ],
533
- value="Article",
534
- label="Format",
535
- elem_classes="input-highlight-turquoise",
536
- )
537
-
538
- input_length = gr.Slider(
539
- minimum=50,
540
- maximum=5000,
541
- step=50,
542
- value=300,
543
- label="Article Length",
544
- elem_classes="input-highlight-pink",
545
- )
546
-
547
- with gr.Row():
548
- input_writing_style = gr.Dropdown(
549
- choices=[
550
- "Formal",
551
- "Informal",
552
- "Technical",
553
- "Conversational",
554
- "Journalistic",
555
- "Academic",
556
- "Creative",
557
- ],
558
- value="Formal",
559
- label="Writing Style",
560
- elem_classes="input-highlight-yellow",
561
- )
562
- input_tone = gr.Dropdown(
563
- choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
564
- value="Professional",
565
- label="Tone",
566
- elem_classes="input-highlight-turquoise",
567
- )
568
-
569
- input_user_category = gr.Dropdown(
570
- choices=[
571
- "Students",
572
- "Professionals",
573
- "Researchers",
574
- "General Public",
575
- "Policymakers",
576
- "Entrepreneurs",
577
- ],
578
- value="General Public",
579
- label="Target Audience",
580
- elem_classes="input-highlight-pink",
581
- )
582
- input_depth = gr.Dropdown(
583
- choices=[
584
- "Surface-level overview",
585
- "Moderate analysis",
586
- "In-depth research",
587
- "Comprehensive study",
588
- ],
589
- value="Moderate analysis",
590
- label="Depth of Content",
591
- elem_classes="input-highlight-yellow",
592
- )
593
- input_structure = gr.Dropdown(
594
- choices=[
595
- "Introduction, Body, Conclusion",
596
- "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
597
- "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
598
- "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
599
- ],
600
- value="Introduction, Body, Conclusion",
601
- label="Structure",
602
- elem_classes="input-highlight-turquoise",
603
- )
604
- input_references = gr.Dropdown(
605
- choices=[
606
- "Academic journals",
607
- "Industry reports",
608
- "Government publications",
609
- "News outlets",
610
- "Expert interviews",
611
- "Case studies",
612
- ],
613
- value="News outlets",
614
- label="References",
615
- elem_classes="input-highlight-pink",
616
- )
617
- input_num_examples = gr.Dropdown(
618
- choices=["1-2", "3-4", "5+"],
619
- value="1-2",
620
- label="Number of Examples/Case Studies",
621
- elem_classes="input-highlight-yellow",
622
- )
623
- input_conclusion = gr.Dropdown(
624
- choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
625
- value="Call to Action",
626
- label="Conclusion Type",
627
- elem_classes="input-highlight-turquoise",
628
- )
629
- gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
630
- with gr.Row():
631
- google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=False)
632
- with gr.Group(visible=True) as search_options:
633
- with gr.Row():
634
- include_sites = gr.Textbox(
635
- label="Include Specific Websites",
636
- placeholder="Enter comma-separated keywords",
637
- elem_classes="input-highlight-yellow",
638
- )
639
- with gr.Row():
640
- exclude_sites = gr.Textbox(
641
- label="Exclude Specific Websites",
642
- placeholder="Enter comma-separated keywords",
643
- elem_classes="input-highlight-yellow",
644
- )
645
- with gr.Row():
646
- domains_to_include = gr.Dropdown(
647
- domain_list,
648
- value=domain_list,
649
- multiselect=True,
650
- label="Domains To Include",
651
- )
652
- with gr.Row():
653
- month_from = gr.Dropdown(
654
- choices=months,
655
- label="From Month",
656
- value="January",
657
- interactive=True,
658
- )
659
- day_from = gr.Textbox(label="From Day", value="01")
660
- year_from = gr.Textbox(label="From Year", value="2000")
661
-
662
- with gr.Row():
663
- month_to = gr.Dropdown(
664
- choices=months,
665
- label="To Month",
666
- value=d1[1],
667
- interactive=True,
668
- )
669
- day_to = gr.Textbox(label="To Day", value=d1[0])
670
- year_to = gr.Textbox(label="To Year", value=d1[2])
671
-
672
- gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
673
- pdf_file_input = gr.File(label="Upload PDF")
674
-
675
- with gr.Group():
676
- gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
677
- ai_generator = gr.Dropdown(
678
- choices=[
679
- "OpenAI GPT 4",
680
- "OpenAI GPT 4o",
681
- "OpenAI GPT 4o Mini",
682
- "Claude Sonnet 3.5",
683
- "Gemini 1.5 Pro",
684
- "LLaMA 3",
685
- ],
686
- value="OpenAI GPT 4o Mini",
687
- label="AI Model",
688
- elem_classes="input-highlight-pink",
689
- )
690
- # input_api = gr.Textbox(label="API Key", visible=False)
691
- # ai_generator.change(update_visibility_api, ai_generator, input_api)
692
-
693
- generate_btn = gr.Button("Generate Article", variant="primary")
694
-
695
- with gr.Accordion("Advanced Humanizer Settings", open=False):
696
- with gr.Row():
697
- model_dropdown = gr.Radio(
698
- choices=[
699
- "Base Model",
700
- "Large Model",
701
- "XL Model",
702
- # "XL Law Model",
703
- # "XL Marketing Model",
704
- # "XL Child Style Model",
705
- ],
706
- value="Large Model",
707
- label="Humanizer Model Version",
708
- )
709
- with gr.Row():
710
- temperature_slider = gr.Slider(
711
- minimum=0.5, maximum=2.0, step=0.1, value=1.3, label="Temperature"
712
- )
713
- top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=50, label="Top k")
714
- with gr.Row():
715
- repetition_penalty_slider = gr.Slider(
716
- minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
717
- )
718
- length_penalty_slider = gr.Slider(
719
- minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
720
- )
721
-
722
- with gr.Column(scale=3):
723
- output_article = gr.Textbox(label="Generated Article", lines=20)
724
- ai_comments = gr.Textbox(
725
- label="Add comments to help edit generated text", interactive=True, visible=False
726
- )
727
- regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
728
- ai_detector_dropdown = gr.Radio(
729
- choices=ai_check_options, label="Select AI Detector", value="Polygraf AI"
730
- )
731
- ai_check_btn = gr.Button("AI Check")
732
-
733
- with gr.Accordion("AI Detection Results", open=True):
734
- ai_check_result = gr.Label(label="AI Check Result")
735
- highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
736
- humanize_btn = gr.Button("Humanize")
737
- # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
738
- humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
739
- copy_to_input_btn = gr.Button("Copy to Input for AI Check")
740
-
741
- def regenerate_visible(text):
742
- if text:
743
- return gr.update(visible=True)
744
- else:
745
- return gr.update(visible=False)
746
-
747
- def highlight_visible(text):
748
- if text.startswith("Polygraf"):
749
- return gr.update(visible=True)
750
- else:
751
- return gr.update(visible=False)
752
-
753
- def search_visible(toggle):
754
- if toggle:
755
- return gr.update(visible=True)
756
- else:
757
- return gr.update(visible=False)
758
-
759
- google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
760
- ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
761
- output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
762
- ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
763
- ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
764
-
765
- generate_btn.click(
766
- fn=generate_and_format,
767
- inputs=[
768
- input_role,
769
- input_topic,
770
- input_keywords,
771
- input_length,
772
- input_format,
773
- input_writing_style,
774
- input_tone,
775
- input_user_category,
776
- input_depth,
777
- input_structure,
778
- input_references,
779
- input_num_examples,
780
- input_conclusion,
781
- ai_generator,
782
- # input_api,
783
- google_search_check,
784
- year_from,
785
- month_from,
786
- day_from,
787
- year_to,
788
- month_to,
789
- day_to,
790
- domains_to_include,
791
- include_sites,
792
- exclude_sites,
793
- pdf_file_input,
794
- ],
795
- outputs=[output_article],
796
- )
797
-
798
- regenerate_btn.click(
799
- fn=generate_and_format,
800
- inputs=[
801
- input_role,
802
- input_topic,
803
- input_keywords,
804
- input_length,
805
- input_format,
806
- input_writing_style,
807
- input_tone,
808
- input_user_category,
809
- input_depth,
810
- input_structure,
811
- input_references,
812
- input_num_examples,
813
- input_conclusion,
814
- ai_generator,
815
- # input_api,
816
- google_search_check,
817
- year_from,
818
- month_from,
819
- day_from,
820
- year_to,
821
- month_to,
822
- day_to,
823
- domains_to_include,
824
- pdf_file_input,
825
- output_article,
826
- include_sites,
827
- exclude_sites,
828
- ai_comments,
829
- ],
830
- outputs=[output_article],
831
- )
832
-
833
- ai_check_btn.click(
834
- fn=ai_check,
835
- inputs=[output_article, ai_detector_dropdown],
836
- outputs=[ai_check_result, highlighted_text],
837
- )
838
 
839
- humanize_btn.click(
840
- fn=humanize,
841
- inputs=[
842
- output_article,
843
- model_dropdown,
844
- temperature_slider,
845
- repetition_penalty_slider,
846
- top_k_slider,
847
- length_penalty_slider,
848
- ],
849
- outputs=[humanized_output],
850
- )
851
 
852
- copy_to_input_btn.click(
853
- fn=copy_to_input,
854
- inputs=[humanized_output],
855
- outputs=[output_article],
856
  )
857
-
858
- return demo
859
-
860
-
861
- if __name__ == "__main__":
862
- demo = create_interface()
863
- # demo.launch(server_name="0.0.0.0", share=True, server_port=7890)
864
- demo.launch(server_name="0.0.0.0")
1
+ import gc
2
  import torch
3
+ from nltk import sent_tokenize
4
+ import nltk
5
+ from tqdm import tqdm
6
+ import gradio as gr
7
+ from peft import PeftModel
8
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
9
+
10
+ nltk.download("punkt")
11
+ # pick the device: a specific GPU (GPU_IDX) if CUDA is available, else CPU
12
+ GPU_IDX = 1 # which GPU to use
13
+ if torch.cuda.is_available():
14
+ num_gpus = torch.cuda.device_count()
15
+ print(f"Number of available GPUs: {num_gpus}")
16
+ assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
17
+ device = torch.device(f"cuda:{GPU_IDX}")
18
+ print(f"Using GPU: {GPU_IDX}")
19
+ else:
20
+ print("CUDA is not available. Using CPU instead.")
21
+ device = torch.device("cpu")
22
+
23
+ batch_size = 64
24
+
25
+ # Configuration for models and their adapters
26
+ model_config = {
27
+ "Base Model": "polygraf-ai/poly-humanizer-base",
28
+ "Large Model": "polygraf-ai/poly-humanizer-large",
29
+ "XL Model": "polygraf-ai/poly-humanizer-XL-adapter",
30
  }
31
 
32
+ # cache the base models, tokenizers, and adapters
33
+ # initialize model and tokenizer
34
+ models, tokenizers = {}, {}
35
+ for name, path in model_config.items():
36
+ if name == "XL Model":
37
+ model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16).to(device)
38
+ model = PeftModel.from_pretrained(model, path, torch_dtype=torch.bfloat16, is_trainable=False)
39
+ model = model.merge_and_unload()
40
+ models[name] = model
41
+ tokenizers[name] = T5Tokenizer.from_pretrained("google/flan-t5-xl")
42
  else:
43
+ model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
44
+ models[name] = model
45
+ tokenizers[name] = T5Tokenizer.from_pretrained(path)
46
+ print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")
47
+
48
+
49
+ def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
50
+ inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
51
+ inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
52
+ outputs = model.generate(
53
+ **inputs,
54
+ do_sample=True,
55
  temperature=temperature,
56
  repetition_penalty=repetition_penalty,
57
+ max_length=128,
58
  top_k=top_k,
59
  length_penalty=length_penalty,
60
  )
61
+ answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
62
+ return answers
63
+
64
+
65
+ def paraphrase_text(
66
+ text,
67
+ progress=gr.Progress(),
68
+ model_name="Base Model",
69
+ temperature=1.2,
70
+ repetition_penalty=1.0,
71
+ top_k=50,
72
+ length_penalty=1.0,
73
  ):
74
+ """
75
+ The optimization here is to batch sentences from all paragraphs together and feed them to the model at once.
76
+ Paragraph structure is preserved by recording the number of sentences in each paragraph.
77
+ """
78
+ progress(0, desc="Starting to Humanize")
79
+ # Select the model, tokenizer, and adapter
80
+ tokenizer = tokenizers[model_name]
81
+ model = models[model_name].to(device)
82
 
83
+ # Split the text into paragraphs and then into sentences
84
+ paragraphs = text.split("\n")
85
+ all_sentences = []
86
+ sentences_per_paragraph = []
87
 
88
+ for paragraph in paragraphs:
89
+ sentences = sent_tokenize(paragraph)
90
+ sentences_per_paragraph.append(len(sentences))
91
+ all_sentences.extend(sentences)
92
+
93
+ # Process all sentences in batches
94
+ paraphrased_sentences = []
95
+ for i in progress.tqdm(range(0, len(all_sentences), batch_size)):
96
+ batch_sentences = all_sentences[i : i + batch_size]
97
+ paraphrased_batch = paraphrase_sentences(
98
+ model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
99
  )
100
+ paraphrased_sentences.extend(paraphrased_batch)
101
+
102
+ # Clear memory
103
+ torch.cuda.empty_cache()
104
+ gc.collect()
105
+
106
+ # Reconstruct paragraphs
107
+ humanized_paragraphs = []
108
+ sentence_index = 0
109
+ for num_sentences in sentences_per_paragraph:
110
+ humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
111
+ humanized_paragraphs.append(humanized_paragraph)
112
+ sentence_index += num_sentences
113
+
114
+ humanized_text = "\n".join(humanized_paragraphs)
115
+ return humanized_text
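
A minimal usage sketch (not part of this commit) for the new humanize.py, assuming the three models above load successfully. The sample_text value is illustrative, and calling paraphrase_text outside a Gradio event assumes the default gr.Progress() argument behaves sensibly standalone.

# Hedged sketch, not from the repo: drive the new paraphrase_text() directly.
# Assumes importing humanize succeeds (which loads and caches the models above).
from humanize import paraphrase_text

# Hypothetical two-paragraph input; "\n" separates paragraphs, matching the
# split("\n") used inside paraphrase_text.
sample_text = (
    "Large language models can draft full articles in seconds. "
    "Careful post-editing is still needed for factual accuracy.\n"
    "This second paragraph checks that paragraph breaks survive."
)

humanized = paraphrase_text(
    sample_text,
    model_name="Base Model",  # one of: "Base Model", "Large Model", "XL Model"
    temperature=1.2,
    repetition_penalty=1.0,
    top_k=50,
    length_penalty=1.0,
)
print(humanized)

Because sentences from all paragraphs are batched together (batch_size = 64), a short input like this passes through model.generate() in a single batch, and the recorded per-paragraph sentence counts restore the original paragraph breaks afterwards.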