minko186 committed on
Commit 89644d7 · 1 Parent(s): c180684

pushed minko branch

Files changed (1)
  1. app.py +80 -39
app.py CHANGED
@@ -13,33 +13,41 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
 from scipy.special import softmax
 from collections import defaultdict
 import nltk
-from utils import remove_special_characters
+from utils import remove_special_characters
 
 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
 
 models = {
-    'Polygraf AI Watson (Base Model)': AutoModelForSequenceClassification.from_pretrained('polygraf-ai/bc-roberta-openai-2sent').to(device),
-    'Polygraf AI Sherlock (Advanced Model)': AutoModelForSequenceClassification.from_pretrained('polygraf-ai/bc_combined_3sent').to(device),
+    "Polygraf AI Watson (Base Model)": AutoModelForSequenceClassification.from_pretrained(
+        "polygraf-ai/bc-roberta-openai-2sent"
+    ).to(device),
+    "Polygraf AI Sherlock (Advanced Model)": AutoModelForSequenceClassification.from_pretrained(
+        "polygraf-ai/bc_combined_3sent"
+    ).to(device),
 }
 tokenizers = {
-    'Polygraf AI Watson (Base Model)': AutoTokenizer.from_pretrained('polygraf-ai/bc-roberta-openai-2sent'),
-    'Polygraf AI Sherlock (Advanced Model)': AutoTokenizer.from_pretrained('polygraf-ai/bc_combined_3sent'),
+    "Polygraf AI Watson (Base Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc-roberta-openai-2sent"),
+    "Polygraf AI Sherlock (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
 }
 
+
 # Function to move model to the appropriate device
 def to_device(model):
     return model.to(device)
 
+
 def copy_to_input(text):
     return text
 
+
 def remove_bracketed_numbers(text):
     pattern = r"^\[\d+\]"
     cleaned_text = re.sub(pattern, "", text)
     return cleaned_text
 
+
 def clean_text(text: str) -> str:
     paragraphs = text.split("\n\n")
     cleaned_paragraphs = []
@@ -49,6 +57,7 @@ def clean_text(text: str) -> str:
         cleaned_paragraphs.append(cleaned)
     return "\n".join(cleaned_paragraphs)
 
+
 def format_and_correct(text: str) -> str:
     prompt = f"""
     Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
@@ -57,6 +66,7 @@ def format_and_correct(text: str) -> str:
     corrected_text = generate(prompt, "Groq", None)
     return clean_text(corrected_text)
 
+
 def format_and_correct_para(text: str) -> str:
     paragraphs = text.split("\n")
     corrected_paragraphs = []
@@ -66,6 +76,7 @@ def format_and_correct_para(text: str) -> str:
     corrected_text = "\n\n".join(corrected_paragraphs)
     return corrected_text
 
+
 def format_and_correct_language_check(text: str) -> str:
     tool = language_tool_python.LanguageTool("en-US")
     return tool.correct(text)
@@ -86,60 +97,79 @@ def predict(model, tokenizer, text):
     output = model(**tokens)
     output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
     output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
-    return output_norm
+    return output_norm
 
-def ai_generated_test(text, model='BC Original'):
+
+def ai_generated_test(text, model="BC Original"):
     return predict(models[model], tokenizers[model], text)
 
-def process_text(text, model='BC Original'):
+
+def process_text(text, model="BC Original"):
+    # sentences = split_into_sentences(text)
     sentences = nltk.sent_tokenize(text)
     num_sentences = len(sentences)
     scores = defaultdict(list)
+
     overall_scores = []
-
+
+    # Process each chunk of 3 sentences and store the score for each sentence in the chunk
     for i in range(num_sentences):
-        chunk = ' '.join(sentences[i:i+3])
-        if chunk:
+        chunk = " ".join(sentences[i : i + 3])
+        if chunk:
+            # result = classifier(chunk)
             result = ai_generated_test(chunk, model)
-            score = result['AI']
-            for j in range(i, min(i+3, num_sentences)):
+            score = result["AI"]
+            for j in range(i, min(i + 3, num_sentences)):
                 scores[j].append(score)
 
-    colored_sentences = []
-    for i, sentence in enumerate(sentences):
-        if scores[i]:
-            avg_score = sum(scores[i]) / len(scores[i])
-            if avg_score >= 0.65:
-                colored_sentence = f"<span style='background-color:red;'>{sentence}</span>"
-            else:
-                colored_sentence = sentence
-            colored_sentences.append(colored_sentence)
-            overall_scores.append(avg_score)
-
+    # Calculate the average score for each sentence and apply color coding
+    paragraphs = text.split("\n")
+    paragraphs = [s for s in paragraphs if s.strip()]
+    colored_paragraphs = []
+    i = 0
+    for paragraph in paragraphs:
+        temp_sentences = nltk.sent_tokenize(paragraph)
+        colored_sentences = []
+        for sentence in temp_sentences:
+            if scores[i]:
+                avg_score = sum(scores[i]) / len(scores[i])
+                if avg_score >= 0.65:
+                    colored_sentence = f"<span style='background-color:red;'>{sentence}</span>"
+                else:
+                    colored_sentence = sentence
+                colored_sentences.append(colored_sentence)
+                overall_scores.append(avg_score)
+            i = i + 1
+        combined_sentences = " ".join(colored_sentences)
+        print(combined_sentences)
+        colored_paragraphs.append(combined_sentences)
+
     overall_score = sum(overall_scores) / len(overall_scores)
     overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
-    return overall_score, " ".join(colored_sentences)
+    return overall_score, format_references("<br><br>".join(colored_paragraphs))
+
 
 ai_check_options = [
     "Polygraf AI Watson (Base Model)",
     "Polygraf AI Sherlock (Advanced Model)",
 ]
 
+
 def ai_generated_test_sapling(text: str) -> Dict:
     response = requests.post(
-        "https://api.sapling.ai/api/v1/aidetect",
-        json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
+        "https://api.sapling.ai/api/v1/aidetect", json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
    )
     return {"AI": response.json()["score"], "HUMAN": 1 - response.json()["score"]}
 
+
 class GPT2PPL:
     def __init__(self):
         self.device = device
-        self.model = to_device(GPT2LMHeadModel.from_pretrained('gpt2'))
-        self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
+        self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
+        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
 
     def __call__(self, text):
-        encodings = self.tokenizer(text, return_tensors='pt')
+        encodings = self.tokenizer(text, return_tensors="pt")
         encodings = {k: v.to(self.device) for k, v in encodings.items()}
         max_length = self.model.config.n_positions
         stride = 512
@@ -163,15 +193,18 @@ class GPT2PPL:
         ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
         return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
 
+
 def ai_generated_test_gptzero(text):
     gptzero_model = GPT2PPL()
     result = gptzero_model(text)
     print(result)
     return result, None
 
+
 def highlighter_polygraf(text, model="Polygraf AI Watson (Base Model)"):
     return process_text(text=text, model=model)
 
+
 def ai_check(text: str, option: str):
     if option.startswith("Polygraf AI"):
         return highlighter_polygraf(text, option)
@@ -211,6 +244,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
     """
     return prompt
 
+
 def regenerate_prompt(settings: Dict[str, str]) -> str:
     prompt = f"""
     "{settings['generated_article']}"
@@ -228,6 +262,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     """
     return prompt
 
+
 def generate_article(
     topic: str,
     keywords: str,
@@ -290,6 +325,7 @@ def generate_article(
 
     return clean_text(article)
 
+
 def humanize(
     text: str,
     model: str,
@@ -308,12 +344,14 @@ def humanize(
     )
     return format_and_correct_language_check(result)
 
+
 def update_visibility_api(model: str):
     if model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
         return gr.update(visible=True)
     else:
         return gr.update(visible=False)
 
+
 def format_references(text: str) -> str:
     lines = text.split("\n")
     references = []
@@ -336,6 +374,7 @@ def format_references(text: str) -> str:
 
     return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
 
+
 def generate_and_format(
     topic,
     keywords,
@@ -374,6 +413,7 @@
     )
     return format_references(article)
 
+
 def create_interface():
     with gr.Blocks(
         theme=gr.themes.Default(
@@ -422,7 +462,7 @@ def create_interface():
                 step=50,
                 value=1000,
                 label="Article Length",
-                elem_classes="input-highlight-pink"
+                elem_classes="input-highlight-pink",
            )
 
            with gr.Row():
@@ -554,14 +594,14 @@ def create_interface():
                 label="Add comments to help edit generated text", interactive=True, visible=False
            )
            regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
-            with gr.Row():
-                with gr.Column():
-                    ai_detector_dropdown = gr.Radio(
-                        choices=ai_check_options, label="Select AI Detector", value="Polygraf AI Watson (Base Model)"
-                    )
-                    ai_check_btn = gr.Button("AI Check")
+            ai_detector_dropdown = gr.Radio(
+                choices=ai_check_options, label="Select AI Detector", value="Polygraf AI"
+            )
+            ai_check_btn = gr.Button("AI Check")
+
+            with gr.Accordion("AI Detection Results", open=True):
                 ai_check_result = gr.Label(label="AI Check Result")
-            highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
+                highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
            humanize_btn = gr.Button("Humanize")
            # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
            humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
@@ -582,6 +622,7 @@ def create_interface():
        ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
        output_article.change(become_visible, inputs=output_article, outputs=ai_comments)
        ai_comments.change(become_visible, inputs=output_article, outputs=regenerate_btn)
+       ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
 
        generate_btn.click(
            fn=generate_and_format,
@@ -658,4 +699,4 @@ def create_interface():
 if __name__ == "__main__":
     demo = create_interface()
     # demo.launch(server_name="0.0.0.0", share=True, server_port=7890)
-    demo.launch(server_name="0.0.0.0")
+    demo.launch(server_name="0.0.0.0")
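
For reference, a minimal standalone sketch of the 3-sentence sliding-window scoring that process_text uses in this commit. toy_scorer is a hypothetical stand-in for ai_generated_test and the Polygraf classifiers; only the windowing, per-sentence averaging, and the 0.65 flagging threshold from the diff are shown.

# Sketch of the sliding-window sentence scoring (assumes nltk with the punkt data installed).
from collections import defaultdict

import nltk


def toy_scorer(chunk: str) -> dict:
    # Hypothetical scorer standing in for the real classifier call.
    return {"AI": min(len(chunk) / 500, 1.0)}


def score_sentences(text: str, threshold: float = 0.65):
    sentences = nltk.sent_tokenize(text)
    scores = defaultdict(list)
    # Each sentence anchors a window of up to 3 sentences; every sentence in the
    # window receives that window's score, so most sentences collect 3 votes.
    for i in range(len(sentences)):
        chunk = " ".join(sentences[i : i + 3])
        if chunk:
            score = toy_scorer(chunk)["AI"]
            for j in range(i, min(i + 3, len(sentences))):
                scores[j].append(score)
    # Average the votes per sentence and flag the ones above the threshold.
    averages = [sum(scores[i]) / len(scores[i]) for i in range(len(sentences))]
    return [(s, avg, avg >= threshold) for s, avg in zip(sentences, averages)]


if __name__ == "__main__":
    nltk.download("punkt", quiet=True)
    for sentence, avg, flagged in score_sentences("One sentence. Another one. A third. A fourth."):
        print(f"{avg:.2f} {'AI?' if flagged else 'ok '} {sentence}")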
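
The GPT2PPL path can be read the same way: the score returned by ai_generated_test_gptzero is GPT-2 perplexity over the input. A simplified single-pass version (short texts only) looks roughly like this; the class in the diff additionally walks long inputs with a 512-token stride and sums the negative log-likelihoods before exponentiating.

# Rough single-window sketch of the perplexity check behind ai_generated_test_gptzero.
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def perplexity(text: str) -> float:
    encodings = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        # Feeding the input ids as labels yields the mean negative log-likelihood,
        # so exp(loss) is the perplexity of the text under GPT-2.
        loss = model(encodings.input_ids, labels=encodings.input_ids).loss
    return float(torch.exp(loss))


print(perplexity("The quick brown fox jumps over the lazy dog."))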