Spaces: Runtime error

sync with humanize.py from main
Browse files

humanize.py  (+102 -851)  CHANGED
@@ -1,864 +1,115 @@
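This sync strips humanize.py down from a full copy of the Polygraf content-writer app (prompt builders, AI detectors, and the Gradio interface, presumably now provided elsewhere in the Space) to just the humanizer itself: checkpoint loading for the three poly-humanizer models, a batched sentence paraphraser, and the paraphrase_text() entry point. The removed version is shown first, then the replacement.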
- 
- nohup python3 app.py &
- """
- 
- import re
- import requests
- from typing import Dict
- from collections import defaultdict
- from datetime import date
- import gradio as gr
- from scipy.special import softmax
- import language_tool_python
- import nltk
  import torch
- from
- [old lines 16-18 blank or truncated in this view]
- from
- from
- [old lines 21-35 truncated in this view; whatever they set up closes with the "}" kept as context below]
  }
  
- #
- [old lines 39-47 truncated in this view; given the call in predict() below, this was most likely remove_special_characters(), ending with:]
-     return text
-
-
- def remove_bracketed_numbers(text):
-     pattern = r"^\[\d+\]"
-     cleaned_text = re.sub(pattern, "", text)
-     return cleaned_text
-
-
- def clean_text(text: str) -> str:
-     paragraphs = text.split("\n\n")
-     cleaned_paragraphs = []
-     for paragraph in paragraphs:
-         cleaned = re.sub(r"\s+", " ", paragraph).strip()
-         cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
-         cleaned_paragraphs.append(cleaned)
-     return "\n".join(cleaned_paragraphs)
-
-
- def split_text_from_refs(text: str, sep="\n"):
-     lines = text.split("\n")
-     references = []
-     article_text = []
-     index_pattern = re.compile(r"\[(\d+)\]")
-     in_references = False
-
-     for line in lines:
-         if line.strip().lower() == "references" or line.strip().lower() == "references:":
-             in_references = True
-             continue
-         if line.strip().lower().startswith("references:"):
-             in_references = True
-         if in_references:
-             matches = index_pattern.split(line)
-             for match in matches:
-                 if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
-                     references.append(match.strip())
-         else:
-             article_text.append(line)
-
-     formatted_refs = []
-     for i, ref in enumerate(references, 1):
-         ref = remove_bracketed_numbers(ref)
-         formatted_refs.append(f"[{i}] {ref}{sep}")
-
-     return "\n\n".join(article_text), f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
-
-
- def ends_with_references(text):
-     # Define a regular expression pattern for variations of "References:"
-     pattern = re.compile(r"\b[Rr]eferences:\s*$", re.IGNORECASE | re.MULTILINE)
-
-     # Check if the text ends with any form of "References:"
-     return bool(pattern.search(text.strip()))
-
-
- def format_and_correct_language_check(text: str) -> str:
-     return tool.correct(text)
-
-
- def predict(model, tokenizer, text):
-     text = remove_special_characters(text)
-     bc_token_size = 256
-     with torch.no_grad():
-         model.eval()
-         tokens = tokenizer(
-             text,
-             padding="max_length",
-             truncation=True,
-             max_length=bc_token_size,
-             return_tensors="pt",
-         ).to(device)
-         output = model(**tokens)
-         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-         output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
-     return output_norm
-
-
- def ai_generated_test(text, model="BC Original"):
-     return predict(models[model], tokenizers[model], text)
-
-
- def detection_polygraf(text, model="BC Original"):
-     # sentences = split_into_sentences(text)
-     sentences = nltk.sent_tokenize(text)
-     num_sentences = len(sentences)
-     scores = defaultdict(list)
-
-     overall_scores = []
-
-     # Process each chunk of 3 sentences and store the score for each sentence in the chunk
-     for i in range(num_sentences):
-         chunk = " ".join(sentences[i : i + 3])
-         if chunk:
-             # result = classifier(chunk)
-             result = ai_generated_test(chunk, model)
-             score = result["AI"]
-             for j in range(i, min(i + 3, num_sentences)):
-                 scores[j].append(score)
-
-     # Calculate the average score for each sentence and apply color coding
-     paragraphs = text.split("\n")
-     paragraphs = [s for s in paragraphs if s.strip()]
-     colored_paragraphs = []
-     i = 0
-     for paragraph in paragraphs:
-         temp_sentences = nltk.sent_tokenize(paragraph)
-         colored_sentences = []
-         for sentence in temp_sentences:
-             if scores[i]:
-                 avg_score = sum(scores[i]) / len(scores[i])
-                 if avg_score >= 0.65:
-                     colored_sentence = f"<span style='background-color:red;'>{sentence}</span>"
-                 else:
-                     colored_sentence = sentence
-                 colored_sentences.append(colored_sentence)
-                 overall_scores.append(avg_score)
-                 i = i + 1
-         combined_sentences = " ".join(colored_sentences)
-         colored_paragraphs.append(combined_sentences)
-
-     overall_score = sum(overall_scores) / len(overall_scores)
-     overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
-     return overall_score, "<br><br>".join(colored_paragraphs)
-
-
- ai_check_options = [
-     "Polygraf AI (Base Model)",
-     "Polygraf AI (Advanced Model)",
- ]
-
-
- def ai_generated_test_sapling(text: str) -> Dict:
-     response = requests.post(
-         "https://api.sapling.ai/api/v1/aidetect", json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
-     )
-     return {"AI": response.json()["score"], "HUMAN": 1 - response.json()["score"]}
-
-
- class GPT2PPL:
-     def __init__(self):
-         self.device = device
-         self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
-         self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-     def __call__(self, text):
-         encodings = self.tokenizer(text, return_tensors="pt")
-         encodings = {k: v.to(self.device) for k, v in encodings.items()}
-         max_length = self.model.config.n_positions
-         stride = 512
-         seq_len = encodings.input_ids.size(1)
-
-         nlls = []
-         for i in range(0, seq_len, stride):
-             begin_loc = max(i + stride - max_length, 0)
-             end_loc = min(i + stride, seq_len)
-             trg_len = end_loc - i
-             input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
-             target_ids = input_ids.clone()
-             target_ids[:, :-trg_len] = -100
-
-             with torch.no_grad():
-                 outputs = self.model(input_ids, labels=target_ids)
-                 neg_log_likelihood = outputs.loss * trg_len
-
-             nlls.append(neg_log_likelihood)
-
-         ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
-         return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
-
-
- def ai_generated_test_gptzero(text):
-     gptzero_model = GPT2PPL()
-     result = gptzero_model(text)
-     return result, None
-
-
- def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
-     body, references = split_text_from_refs(text, "<br>")
-     score, text = detection_polygraf(text=body, model=model)
-     text = text + "<br>" + references
-     return score, text
-
-
- def ai_check(text: str, option: str):
-     if option.startswith("Polygraf AI"):
-         return highlighter_polygraf(text, option)
-     elif option == "Sapling AI":
-         return ai_generated_test_sapling(text)
-     elif option == "GPTZero":
-         return ai_generated_test_gptzero(text)
      else:
- [old lines 240-252 truncated in this view: the else branch and, judging by the template below and the call in generate_article(), the start of generate_prompt(settings)]
-     Content:
-     - Depth: {settings['depth_of_content']}
-     - Structure: {', '.join(settings['structure'])}
-
-     Keywords to incorporate:
-     {', '.join(settings['keywords'])}
-
-     Additional requirements:
-     - Don't start with "Here is a...", start with the requested text directly
-     - Include {settings['num_examples']} relevant examples or case studies
-     - Incorporate data or statistics from {', '.join(settings['references'])}
-     - End with a {settings['conclusion_type']} conclusion
-     - Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
-     - Do not make any headline, title bold.
-     {settings['sources']}
-
-     Ensure proper paragraph breaks for better readability.
-     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
-     """
-     return prompt
-
-
- def regenerate_prompt(settings: Dict[str, str]) -> str:
-     prompt = f"""
-     I am a {settings['role']}
-     "{settings['generated_article']}"
-     Edit the given text based on user comments.
-
-     Comments:
-     - Don't start with "Here is a...", start with the requested text directly
-     - {settings['user_comments']}
-     - The original content should not be changed. Make minor modifications based on user comments above.
-     - Keep the references the same as the given text in the same format.
-     - Do not make any headline, title bold.
-     {settings['sources']}
-
-     Ensure proper paragraph breaks for better readability.
-     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
-     """
-     return prompt
-
-
- def generate_article(
-     input_role: str,
-     topic: str,
-     keywords: str,
-     article_length: str,
-     format: str,
-     writing_style: str,
-     tone: str,
-     user_category: str,
-     depth_of_content: str,
-     structure: str,
-     references: str,
-     num_examples: str,
-     conclusion_type: str,
-     ai_model: str,
-     content_string: str,
-     # api_key: str = None,
-     pdf_file_input=None,
-     generated_article: str = None,
-     user_comments: str = None,
- ) -> str:
-     settings = {
-         "role": input_role,
-         "topic": topic,
-         "keywords": [k.strip() for k in keywords.split(",")],
-         "article_length": article_length,
-         "format": format,
-         "writing_style": writing_style,
-         "tone": tone,
-         "user_category": user_category,
-         "depth_of_content": depth_of_content,
-         "structure": [s.strip() for s in structure.split(",")],
-         "references": [r.strip() for r in references.split(",")],
-         "num_examples": num_examples,
-         "conclusion_type": conclusion_type,
-         "sources": content_string,
-         "generated_article": generated_article,
-         "user_comments": user_comments,
-     }
-
-     if generated_article:
-         prompt = regenerate_prompt(settings)
-     else:
-         prompt = generate_prompt(settings)
-
-     print("Generated Prompt...\n", prompt)
-     article = generate(
-         prompt,
-         ai_model,
-         pdf_file_input,  # api_key
-     )
-
-     return clean_text(article)
-
-
- def humanize(
-     text: str,
-     model: str,
-     temperature: float = 1.2,
-     repetition_penalty: float = 1,
-     top_k: int = 50,
-     length_penalty: float = 1,
- ) -> str:
-     body, references = split_text_from_refs(text)
-     result = paraphrase_text(
-         text=body,
-         model_name=model,
          temperature=temperature,
          repetition_penalty=repetition_penalty,
          top_k=top_k,
          length_penalty=length_penalty,
      )
-
-     return   [old line 368 is cut off after "return" in this view]
-
-
- def   [old line 371 is cut off after "def"; lines 372-378 are truncated. The body below and the call at the end of generate_and_format() indicate this is format_references(text)]
-     lines = text.split("\n")
-     references = []
-     article_text = []
-     index_pattern = re.compile(r"\[(\d+)\]")
-     in_references = False
-
-     for line in lines:
-         if line.strip().lower() == "references" or line.strip().lower() == "references:":
-             in_references = True
-             continue
-         if line.strip().lower().startswith("references:"):
-             in_references = True
-         if in_references:
-             matches = index_pattern.split(line)
-             for match in matches:
-                 if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
-                     references.append(match.strip())
-         else:
-             article_text.append(line)
-
-     formatted_refs = []
-     for i, ref in enumerate(references, 1):
-         ref = remove_bracketed_numbers(ref)
-         formatted_refs.append(f"[{i}] {ref}\n")
-
-     return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
-
-
- def generate_and_format(
-     input_role,
-     topic,
-     keywords,
-     article_length,
-     format,
-     writing_style,
-     tone,
-     user_category,
-     depth_of_content,
-     structure,
-     references,
-     num_examples,
-     conclusion_type,
-     ai_model,
-     # api_key,
-     google_search_check,
-     year_from,
-     month_from,
-     day_from,
-     year_to,
-     month_to,
-     day_to,
-     domains_to_include,
-     include_sites,
-     exclude_sites,
-     pdf_file_input,
-     generated_article: str = None,
-     user_comments: str = None,
  ):
- [old lines 437-444 truncated in this view: presumably the date-range handling and the initial construction of final_query, before the site filters that survive below]
-         site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
-         final_query += " " + " OR ".join(site_queries)
-         if exclude_sites:
-             exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
-             final_query += " " + " ".join(exclude_queries)
-         print(f"Google Search Query: {final_query}")
-         url_content = google_search(final_query, sorted_date, domains_to_include)
-         content_string = "\n".join(
-             f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
-         )
-         content_string = (
-             "Use the trusted information here from the URLs and add them as References:\n" + content_string
-         )
-     article = generate_article(
-         input_role,
-         topic,
-         keywords,
-         article_length,
-         format,
-         writing_style,
-         tone,
-         user_category,
-         depth_of_content,
-         structure,
-         references,
-         num_examples,
-         conclusion_type,
-         ai_model,
-         content_string,
-         # api_key,
-         pdf_file_input,
-         generated_article,
-         user_comments,
-     )
-     if ends_with_references(article) and url_content is not None:
-         for url in url_content.keys():
-             article += f"\n{url}"
-
-     return format_references(article)
-
-
- def create_interface():
-     with gr.Blocks(
-         theme=gr.themes.Default(
-             primary_hue=gr.themes.colors.pink, secondary_hue=gr.themes.colors.yellow, neutral_hue=gr.themes.colors.gray
-         ),
-         css="""
-         .input-highlight-pink block_label {background-color: #008080}
-         """,
-     ) as demo:
-         today = date.today()
-         # dd/mm/YY
-         d1 = today.strftime("%d/%B/%Y")
-         d1 = d1.split("/")
-         gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
-
-         with gr.Row():
-             with gr.Column(scale=2):
-                 with gr.Group():
-                     gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
-                     input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
-                     input_topic = gr.Textbox(
-                         label="Topic",
-                         placeholder="Enter the main topic of your article",
-                         elem_classes="input-highlight-pink",
-                     )
-                     input_keywords = gr.Textbox(
-                         label="Keywords",
-                         placeholder="Enter comma-separated keywords",
-                         elem_classes="input-highlight-yellow",
-                     )
-
-                     with gr.Row():
-                         input_format = gr.Dropdown(
-                             choices=[
-                                 "Article",
-                                 "Essay",
-                                 "Blog post",
-                                 "Report",
-                                 "Research paper",
-                                 "News article",
-                                 "White paper",
-                                 "LinkedIn post",
-                                 "X (Twitter) post",
-                                 "Instagram Video Content",
-                                 "TikTok Video Content",
-                                 "Facebook post",
-                             ],
-                             value="Article",
-                             label="Format",
-                             elem_classes="input-highlight-turquoise",
-                         )
-
-                     input_length = gr.Slider(
-                         minimum=50,
-                         maximum=5000,
-                         step=50,
-                         value=300,
-                         label="Article Length",
-                         elem_classes="input-highlight-pink",
-                     )
-
-                     with gr.Row():
-                         input_writing_style = gr.Dropdown(
-                             choices=[
-                                 "Formal",
-                                 "Informal",
-                                 "Technical",
-                                 "Conversational",
-                                 "Journalistic",
-                                 "Academic",
-                                 "Creative",
-                             ],
-                             value="Formal",
-                             label="Writing Style",
-                             elem_classes="input-highlight-yellow",
-                         )
-                         input_tone = gr.Dropdown(
-                             choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
-                             value="Professional",
-                             label="Tone",
-                             elem_classes="input-highlight-turquoise",
-                         )
-
-                     input_user_category = gr.Dropdown(
-                         choices=[
-                             "Students",
-                             "Professionals",
-                             "Researchers",
-                             "General Public",
-                             "Policymakers",
-                             "Entrepreneurs",
-                         ],
-                         value="General Public",
-                         label="Target Audience",
-                         elem_classes="input-highlight-pink",
-                     )
-                     input_depth = gr.Dropdown(
-                         choices=[
-                             "Surface-level overview",
-                             "Moderate analysis",
-                             "In-depth research",
-                             "Comprehensive study",
-                         ],
-                         value="Moderate analysis",
-                         label="Depth of Content",
-                         elem_classes="input-highlight-yellow",
-                     )
-                     input_structure = gr.Dropdown(
-                         choices=[
-                             "Introduction, Body, Conclusion",
-                             "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
-                             "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
-                             "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
-                         ],
-                         value="Introduction, Body, Conclusion",
-                         label="Structure",
-                         elem_classes="input-highlight-turquoise",
-                     )
-                     input_references = gr.Dropdown(
-                         choices=[
-                             "Academic journals",
-                             "Industry reports",
-                             "Government publications",
-                             "News outlets",
-                             "Expert interviews",
-                             "Case studies",
-                         ],
-                         value="News outlets",
-                         label="References",
-                         elem_classes="input-highlight-pink",
-                     )
-                     input_num_examples = gr.Dropdown(
-                         choices=["1-2", "3-4", "5+"],
-                         value="1-2",
-                         label="Number of Examples/Case Studies",
-                         elem_classes="input-highlight-yellow",
-                     )
-                     input_conclusion = gr.Dropdown(
-                         choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
-                         value="Call to Action",
-                         label="Conclusion Type",
-                         elem_classes="input-highlight-turquoise",
-                     )
-                 gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
-                 with gr.Row():
-                     google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=False)
-                 with gr.Group(visible=True) as search_options:
-                     with gr.Row():
-                         include_sites = gr.Textbox(
-                             label="Include Specific Websites",
-                             placeholder="Enter comma-separated keywords",
-                             elem_classes="input-highlight-yellow",
-                         )
-                     with gr.Row():
-                         exclude_sites = gr.Textbox(
-                             label="Exclude Specific Websites",
-                             placeholder="Enter comma-separated keywords",
-                             elem_classes="input-highlight-yellow",
-                         )
-                     with gr.Row():
-                         domains_to_include = gr.Dropdown(
-                             domain_list,
-                             value=domain_list,
-                             multiselect=True,
-                             label="Domains To Include",
-                         )
-                     with gr.Row():
-                         month_from = gr.Dropdown(
-                             choices=months,
-                             label="From Month",
-                             value="January",
-                             interactive=True,
-                         )
-                         day_from = gr.Textbox(label="From Day", value="01")
-                         year_from = gr.Textbox(label="From Year", value="2000")
-
-                     with gr.Row():
-                         month_to = gr.Dropdown(
-                             choices=months,
-                             label="To Month",
-                             value=d1[1],
-                             interactive=True,
-                         )
-                         day_to = gr.Textbox(label="To Day", value=d1[0])
-                         year_to = gr.Textbox(label="To Year", value=d1[2])
-
-                 gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
-                 pdf_file_input = gr.File(label="Upload PDF")
-
-                 with gr.Group():
-                     gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
-                     ai_generator = gr.Dropdown(
-                         choices=[
-                             "OpenAI GPT 4",
-                             "OpenAI GPT 4o",
-                             "OpenAI GPT 4o Mini",
-                             "Claude Sonnet 3.5",
-                             "Gemini 1.5 Pro",
-                             "LLaMA 3",
-                         ],
-                         value="OpenAI GPT 4o Mini",
-                         label="AI Model",
-                         elem_classes="input-highlight-pink",
-                     )
-                     # input_api = gr.Textbox(label="API Key", visible=False)
-                     # ai_generator.change(update_visibility_api, ai_generator, input_api)
-
-                 generate_btn = gr.Button("Generate Article", variant="primary")
-
-                 with gr.Accordion("Advanced Humanizer Settings", open=False):
-                     with gr.Row():
-                         model_dropdown = gr.Radio(
-                             choices=[
-                                 "Base Model",
-                                 "Large Model",
-                                 "XL Model",
-                                 # "XL Law Model",
-                                 # "XL Marketing Model",
-                                 # "XL Child Style Model",
-                             ],
-                             value="Large Model",
-                             label="Humanizer Model Version",
-                         )
-                     with gr.Row():
-                         temperature_slider = gr.Slider(
-                             minimum=0.5, maximum=2.0, step=0.1, value=1.3, label="Temperature"
-                         )
-                         top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=50, label="Top k")
-                     with gr.Row():
-                         repetition_penalty_slider = gr.Slider(
-                             minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
-                         )
-                         length_penalty_slider = gr.Slider(
-                             minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
-                         )
-
-             with gr.Column(scale=3):
-                 output_article = gr.Textbox(label="Generated Article", lines=20)
-                 ai_comments = gr.Textbox(
-                     label="Add comments to help edit generated text", interactive=True, visible=False
-                 )
-                 regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
-                 ai_detector_dropdown = gr.Radio(
-                     choices=ai_check_options, label="Select AI Detector", value="Polygraf AI"
-                 )
-                 ai_check_btn = gr.Button("AI Check")
-
-                 with gr.Accordion("AI Detection Results", open=True):
-                     ai_check_result = gr.Label(label="AI Check Result")
-                     highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
-                 humanize_btn = gr.Button("Humanize")
-                 # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
-                 humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
-                 copy_to_input_btn = gr.Button("Copy to Input for AI Check")
-
-         def regenerate_visible(text):
-             if text:
-                 return gr.update(visible=True)
-             else:
-                 return gr.update(visible=False)
-
-         def highlight_visible(text):
-             if text.startswith("Polygraf"):
-                 return gr.update(visible=True)
-             else:
-                 return gr.update(visible=False)
-
-         def search_visible(toggle):
-             if toggle:
-                 return gr.update(visible=True)
-             else:
-                 return gr.update(visible=False)
-
-         google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
-         ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
-         output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
-         ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
-         ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
-
-         generate_btn.click(
-             fn=generate_and_format,
-             inputs=[
-                 input_role,
-                 input_topic,
-                 input_keywords,
-                 input_length,
-                 input_format,
-                 input_writing_style,
-                 input_tone,
-                 input_user_category,
-                 input_depth,
-                 input_structure,
-                 input_references,
-                 input_num_examples,
-                 input_conclusion,
-                 ai_generator,
-                 # input_api,
-                 google_search_check,
-                 year_from,
-                 month_from,
-                 day_from,
-                 year_to,
-                 month_to,
-                 day_to,
-                 domains_to_include,
-                 include_sites,
-                 exclude_sites,
-                 pdf_file_input,
-             ],
-             outputs=[output_article],
-         )
-
-         regenerate_btn.click(
-             fn=generate_and_format,
-             inputs=[
-                 input_role,
-                 input_topic,
-                 input_keywords,
-                 input_length,
-                 input_format,
-                 input_writing_style,
-                 input_tone,
-                 input_user_category,
-                 input_depth,
-                 input_structure,
-                 input_references,
-                 input_num_examples,
-                 input_conclusion,
-                 ai_generator,
-                 # input_api,
-                 google_search_check,
-                 year_from,
-                 month_from,
-                 day_from,
-                 year_to,
-                 month_to,
-                 day_to,
-                 domains_to_include,
-                 pdf_file_input,
-                 output_article,
-                 include_sites,
-                 exclude_sites,
-                 ai_comments,
-             ],
-             outputs=[output_article],
-         )
-
-         ai_check_btn.click(
-             fn=ai_check,
-             inputs=[output_article, ai_detector_dropdown],
-             outputs=[ai_check_result, highlighted_text],
-         )
  
- [old lines 839-842 truncated in this view; from the argument list that survives below, this was almost certainly the humanize_btn.click(fn=..., inputs=[...]) wiring]
-                 model_dropdown,
-                 temperature_slider,
-                 repetition_penalty_slider,
-                 top_k_slider,
-                 length_penalty_slider,
-             ],
-             outputs=[humanized_output],
-         )
  
- [old lines 852-855 truncated in this view, presumably the copy_to_input_btn wiring]
  )
- [old lines 857-864 truncated in this view; only a stray "#" comment on line 863 survives]
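The replacement module follows. Its one non-obvious piece of bookkeeping: paraphrase_text() flattens the sentences of every paragraph into a single list so that generation can run in batches of batch_size = 64, while sentences_per_paragraph records how many sentences each paragraph contributed. For example, sentences_per_paragraph = [3, 2] means the first three paraphrased sentences are rejoined as paragraph one and the next two as paragraph two.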
+ import gc
  import torch
+ from nltk import sent_tokenize
+ import nltk
+ from tqdm import tqdm
+ import gradio as gr
+ from peft import PeftModel
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+ nltk.download("punkt")
+ # autodetect the available device
+ GPU_IDX = 1  # which GPU to use
+ if torch.cuda.is_available():
+     num_gpus = torch.cuda.device_count()
+     print(f"Number of available GPUs: {num_gpus}")
+     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
+     device = torch.device(f"cuda:{GPU_IDX}")
+     print(f"Using GPU: {GPU_IDX}")
+ else:
+     print("CUDA is not available. Using CPU instead.")
+     device = torch.device("cpu")
+
+ batch_size = 64
+
+ # Configuration for models and their adapters
+ model_config = {
+     "Base Model": "polygraf-ai/poly-humanizer-base",
+     "Large Model": "polygraf-ai/poly-humanizer-large",
+     "XL Model": "polygraf-ai/poly-humanizer-XL-adapter",
  }
  
+ # cache the base models, tokenizers, and adapters
+ # initialize model and tokenizer
+ models, tokenizers = {}, {}
+ for name, path in model_config.items():
+     if name == "XL Model":
+         model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16).to(device)
+         model = PeftModel.from_pretrained(model, path, torch_dtype=torch.bfloat16, is_trainable=False)
+         model = model.merge_and_unload()
+         models[name] = model
+         tokenizers[name] = T5Tokenizer.from_pretrained("google/flan-t5-xl")
      else:
+         model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+         models[name] = model
+         tokenizers[name] = T5Tokenizer.from_pretrained(path)
+     print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")
+
+
+ def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
+     inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
+     inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
+     outputs = model.generate(
+         **inputs,
+         do_sample=True,
          temperature=temperature,
          repetition_penalty=repetition_penalty,
+         max_length=128,
          top_k=top_k,
          length_penalty=length_penalty,
      )
+     answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
+     return answers
+
+
+ def paraphrase_text(
+     text,
+     progress=gr.Progress(),
+     model_name="Base Model",
+     temperature=1.2,
+     repetition_penalty=1.0,
+     top_k=50,
+     length_penalty=1.0,
  ):
+     """
+     Optimization here is to feed all sentences at once to the model.
+     Paragraphs are stored as a number of sentences per paragraph.
+     """
+     progress(0, desc="Starting to Humanize")
+     # Select the model, tokenizer, and adapter
+     tokenizer = tokenizers[model_name]
+     model = models[model_name].to(device)
  
+     # Split the text into paragraphs and then into sentences
+     paragraphs = text.split("\n")
+     all_sentences = []
+     sentences_per_paragraph = []
  
+     for paragraph in paragraphs:
+         sentences = sent_tokenize(paragraph)
+         sentences_per_paragraph.append(len(sentences))
+         all_sentences.extend(sentences)
+
+     # Process all sentences in batches
+     paraphrased_sentences = []
+     for i in progress.tqdm(range(0, len(all_sentences), batch_size)):
+         batch_sentences = all_sentences[i : i + batch_size]
+         paraphrased_batch = paraphrase_sentences(
+             model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
          )
+         paraphrased_sentences.extend(paraphrased_batch)
+
+     # Clear memory
+     torch.cuda.empty_cache()
+     gc.collect()
+
+     # Reconstruct paragraphs
+     humanized_paragraphs = []
+     sentence_index = 0
+     for num_sentences in sentences_per_paragraph:
+         humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
+         humanized_paragraphs.append(humanized_paragraph)
+         sentence_index += num_sentences
+
+     humanized_text = "\n".join(humanized_paragraphs)
+     return humanized_text
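
Not part of the commit, for orientation only: a minimal sketch of how the new entry point can be driven from a Gradio app. Gradio injects a tracker for the gr.Progress() default when paraphrase_text is used as an event handler, so the two inputs below map to text and model_name. The module path (humanize) and the component names are assumptions, not code from this repo.

    # sketch.py: assumes the new file above is saved as humanize.py and that the
    # polygraf-ai checkpoints are downloadable; the models load at import time.
    import gradio as gr
    from humanize import paraphrase_text

    with gr.Blocks() as demo:
        inp = gr.Textbox(label="Text to humanize", lines=8)
        model = gr.Radio(
            ["Base Model", "Large Model", "XL Model"],
            value="Base Model",
            label="Humanizer Model Version",
        )
        btn = gr.Button("Humanize")
        out = gr.Markdown()

        # The remaining generation knobs keep their defaults
        # (temperature=1.2, repetition_penalty=1.0, top_k=50, length_penalty=1.0).
        btn.click(fn=paraphrase_text, inputs=[inp, model], outputs=[out])

    demo.launch()

The removed app exposed those same knobs as sliders and routed them through its humanize() wrapper; wiring them up here would just mean appending the slider components to inputs in the same order as paraphrase_text's parameters.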