eljanmahammadli committed
Commit e2a79fa · 1 Parent(s): db77dd7

added new decoder-only LM as a humanizer + UI support

Files changed (2):
1. app.py +36 -31
2. humanize.py +120 -35
app.py CHANGED
@@ -4,22 +4,21 @@ export GOOGLE_APPLICATION_CREDENTIALS="gcp_creds.json"
 """
 
 import re
-import requests
 from typing import Dict
 from collections import defaultdict
 from datetime import date, datetime
+
 import gradio as gr
-from scipy.special import softmax
-import language_tool_python
 import nltk
 import torch
 import numpy as np
-from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+from scipy.special import softmax
+import language_tool_python
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
-from humanize import paraphrase_text, device
+from humanize import humanize_text, device
 from ai_generate import generate
 
 print(f"Using device: {device}")
@@ -115,7 +114,6 @@ def split_text_from_refs(text: str, sep="\n"):
 def ends_with_references(text):
     # Define a regular expression pattern for variations of "References:"
    pattern = re.compile(r"\b[Rr]eferences:\s*$", re.IGNORECASE | re.MULTILINE)
-
     # Check if the text ends with any form of "References:"
     return bool(pattern.search(text.strip()))
 
@@ -400,7 +398,7 @@ def humanize(
 ) -> str:
     print("Humanizing text...")
     body, references = split_text_from_refs(text)
-    result = paraphrase_text(
+    result = humanize_text(
         text=body,
         model_name=model,
         temperature=temperature,
@@ -442,6 +440,13 @@ def update_structure(format_choice):
         return gr.update(value="Introduction, Body, Conclusion", interactive=True)
 
 
+def update_temperature(model_dropdown):
+    if model_dropdown == "Standard Model":
+        return gr.update(value=1.2, interactive=True)
+    elif model_dropdown == "Advanced Model (Beta)":
+        return gr.update(value=1.0, interactive=True)
+
+
 import uuid
 import json
 from datetime import datetime
@@ -859,30 +864,6 @@ def create_interface():
                 """
             generate_btn = gr.Button("Generate Article", variant="primary")
 
-            with gr.Accordion("Advanced Humanizer Settings", open=False):
-                with gr.Row():
-                    model_dropdown = gr.Radio(
-                        choices=[
-                            "Base Model",
-                            "Large Model",
-                            "XL Model",
-                        ],
-                        value="XL Model",
-                        label="Humanizer Model Version",
-                    )
-                with gr.Row():
-                    temperature_slider = gr.Slider(
-                        minimum=0.5, maximum=2.0, step=0.1, value=1.1, label="Temperature"
-                    )
-                    top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
-                with gr.Row():
-                    repetition_penalty_slider = gr.Slider(
-                        minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
-                    )
-                    length_penalty_slider = gr.Slider(
-                        minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
-                    )
-
         with gr.Column(scale=3):
             with gr.Tab("Text Generator"):
                 output_article = gr.Textbox(label="Generated Article", lines=20)
@@ -899,6 +880,27 @@ def create_interface():
                 ai_check_result = gr.Label(label="AI Check Result")
                 mc_check_result = gr.Label(label="Creator Check Result")
                 highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
+
+                with gr.Accordion("Advanced Humanizer Settings", open=False):
+                    with gr.Row():
+                        model_dropdown = gr.Radio(
+                            choices=["Standard Model", "Advanced Model (Beta)"],
+                            value="Advanced Model (Beta)",
+                            label="Humanizer Model Version",
+                        )
+                    with gr.Row():
+                        temperature_slider = gr.Slider(
+                            minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Temperature"
+                        )
+                        top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
+                    with gr.Row():
+                        repetition_penalty_slider = gr.Slider(
+                            minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
+                        )
+                        length_penalty_slider = gr.Slider(
+                            minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
+                        )
+
                 humanize_btn = gr.Button("Humanize")
                 # humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
                 # copy_to_input_btn = gr.Button("Copy to Input for AI Check")
@@ -937,7 +939,10 @@ def create_interface():
         ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
         ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
 
+        # Update the default structure based on the selected format
+        # e.g. "Plain Text" for certain formats
         input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
+        model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)
 
         generate_btn.click(
             fn=generate_and_format,
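
Note: the new dropdown-to-slider wiring is Gradio's standard change-event pattern. A minimal self-contained sketch of the same idea, mirroring the commit's component names and defaults (the demo scaffold itself is illustrative, not part of the app):

import gradio as gr

def update_temperature(model_choice):
    # Each humanizer model gets its own default sampling temperature
    return gr.update(value=1.2 if model_choice == "Standard Model" else 1.0, interactive=True)

with gr.Blocks() as demo:
    model_dropdown = gr.Radio(
        choices=["Standard Model", "Advanced Model (Beta)"],
        value="Advanced Model (Beta)",
        label="Humanizer Model Version",
    )
    temperature_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Temperature")
    # Changing the radio selection resets the slider to that model's default
    model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)

demo.launch()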
 
humanize.py CHANGED
@@ -1,15 +1,17 @@
 import gc
 import torch
-from nltk import sent_tokenize
 import nltk
-from tqdm import tqdm
+from nltk import sent_tokenize
 import gradio as gr
 from peft import PeftModel
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 
 nltk.download("punkt")
+
+GPU_IDX = 1  # which GPU to use, starts from 0
+BATCH_SIZE = 64  # number of sentences to process in one batch
+
 # autodetect the available device
-GPU_IDX = 1  # which GPU to use
 if torch.cuda.is_available():
     num_gpus = torch.cuda.device_count()
     print(f"Number of available GPUs: {num_gpus}")
@@ -20,26 +22,34 @@ else:
     print("CUDA is not available. Using CPU instead.")
     device = torch.device("cpu")
 
-batch_size = 64
-
-# Configuration for models and their adapters
-model_config = {
-    "Base Model": "polygraf-ai/poly-humanizer-base",
-    "Large Model": "polygraf-ai/poly-humanizer-large",
-    "XL Model": "polygraf-ai/poly-humanizer-XL-merged-v2",
-}
+# ----------------------------
+# load encoder-decoder (sequence-to-sequence) language model
+seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
+seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
+seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
+print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
+# ----------------------------
+# load decoder-only (causal) language model
+from unsloth import FastLanguageModel
+from unsloth.chat_templates import get_chat_template
 
-# cache the base models, tokenizers, and adapters
-# initialize model and tokenizer
-models, tokenizers = {}, {}
-for name, path in model_config.items():
-    model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
-    tokenizers[name] = T5Tokenizer.from_pretrained(path)
-    models[name] = model
-    print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")
+# can only use GPU 0 when using unsloth FastLanguageModel
+max_seq_length = 2048  # any value can be chosen, since RoPE scaling is used
+dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
+load_in_4bit = True  # use 4-bit quantization to reduce memory usage
+dec_only = "polygraf-ai/phi-3-mini-rank-128"
+dec_only_model, dec_only_tokenizer = FastLanguageModel.from_pretrained(
+    model_name=dec_only,
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
+    device_map="cuda:0",
+)
+FastLanguageModel.for_inference(dec_only_model)  # native 2x faster inference
+print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")
 
 
-def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
+def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
     inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
     inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
     outputs = model.generate(
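
Note: a quick smoke test of the seq2seq path loaded above might look like this; the sample sentence and sampling settings are illustrative, not part of the commit:

# Assumes seq2seq_model / seq2seq_tokenizer / device from the module above
prompt = "Please paraphrase this sentence: The results of the study were significant."
encoded = seq2seq_tokenizer(prompt, return_tensors="pt").to(device)
generated = seq2seq_model.generate(**encoded, do_sample=True, temperature=1.2, top_k=50, max_new_tokens=128)
print(seq2seq_tokenizer.decode(generated[0], skip_special_tokens=True))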
@@ -55,10 +65,61 @@ def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_pe
     return answers
 
 
-def paraphrase_text(
+def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
+    pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
+    # Construct the messages batch, one message per sentence
+    messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
+    # Initialize the tokenizer with the chat template
+    tokenizer = get_chat_template(
+        tokenizer,
+        chat_template="phi-3",
+        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
+    )
+
+    # Enable native 2x faster inference
+    FastLanguageModel.for_inference(model)
+    # Initialize an empty list to store responses
+    responses = []
+    # Process each message individually
+    for message in messages_batch:
+        # Apply the chat template to the individual message
+        inputs = tokenizer.apply_chat_template(
+            [message],  # Wrap the message in a list
+            tokenize=True,
+            add_generation_prompt=True,  # Must add for generation
+            return_tensors="pt",
+        ).to("cuda")
+        # Generate the response for the individual message
+        outputs = model.generate(
+            input_ids=inputs,
+            max_new_tokens=1024,
+            use_cache=True,
+            do_sample=True,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_k=top_k,
+            length_penalty=length_penalty,
+        )
+        # Decode the output and store it
+        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
+        responses.append(decoded_output[0])
+
+    # Extract the assistant's reply from each decoded response
+    generated_sentences = []
+    for idx, response in enumerate(responses):
+        generated_sentence = response.split("<|assistant|>")[1].split("<|end|>")[0].strip()
+        generated_sentences.append(generated_sentence)
+        print(sentences[idx])
+        print(generated_sentence)
+        print()
+
+    return generated_sentences
+
+
+def humanize_text(
     text,
     progress=gr.Progress(),
-    model_name="Base Model",
+    model_name="Standard Model",
     temperature=1.2,
     repetition_penalty=1.0,
     top_k=50,
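
Note: calling the new batch function directly is the easiest way to verify the chat-template round trip and the <|assistant|>/<|end|> parsing; a hedged usage sketch (the input sentence is illustrative):

# Assumes dec_only_model / dec_only_tokenizer were loaded above
sentences = ["Furthermore, the methodology employed herein ensures reproducibility."]
rewritten = humanize_batch_decoder_only(
    dec_only_model,
    dec_only_tokenizer,
    sentences,
    temperature=1.0,
    repetition_penalty=1.0,
    top_k=40,
    length_penalty=1.0,
)
print(rewritten)  # one paraphrase per input sentence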
@@ -69,15 +130,16 @@ def paraphrase_text(
     Paragraphs are stored as a number of sentences per paragraph.
     """
     progress(0, desc="Starting to Humanize")
-    # Select the model, tokenizer, and adapter
-    tokenizer = tokenizers[model_name]
-    model = models[model_name].to(device)
+
+    # Map model names to their respective processing functions
+    model_map = {"Standard Model": humanize_batch_seq2seq, "Advanced Model (Beta)": humanize_batch_decoder_only}
+    assert model_name in model_map, f"Invalid model name: {model_name}"
+    process_function = model_map[model_name]
 
     # Split the text into paragraphs and then into sentences
     paragraphs = text.split("\n")
     all_sentences = []
     sentences_per_paragraph = []
-
     for paragraph in paragraphs:
         sentences = sent_tokenize(paragraph)
         sentences_per_paragraph.append(len(sentences))
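
Note: with the dispatch table in place, app.py's humanize() reduces to a single humanize_text call. A sketch of that call as it would run from the Gradio click handler (sample text and settings are illustrative):

# Runs inside a Gradio event handler, where the default gr.Progress() is active
result = humanize_text(
    text="First paragraph.\nSecond paragraph. It has two sentences.",
    model_name="Advanced Model (Beta)",  # or "Standard Model" for the T5 path
    temperature=1.0,
    repetition_penalty=1.0,
    top_k=40,
    length_penalty=1.0,
)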
@@ -85,16 +147,39 @@ def paraphrase_text(
 
     # Process all sentences in batches
     paraphrased_sentences = []
-    for i in progress.tqdm(range(0, len(all_sentences), batch_size)):
-        batch_sentences = all_sentences[i : i + batch_size]
-        paraphrased_batch = paraphrase_sentences(
-            model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
-        )
-        paraphrased_sentences.extend(paraphrased_batch)
+    current_batch_size = BATCH_SIZE
+    i = 0
+
+    while i < len(all_sentences):
+        try:
+            batch_sentences = all_sentences[i : i + current_batch_size]
+
+            # Call the selected processing function
+            paraphrased_batch = process_function(
+                seq2seq_model if model_name == "Standard Model" else dec_only_model,
+                seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer,
+                batch_sentences,
+                temperature,
+                repetition_penalty,
+                top_k,
+                length_penalty,
+            )
+
+            paraphrased_sentences.extend(paraphrased_batch)
+            i += current_batch_size  # Move to the next batch
+            torch.cuda.empty_cache()
+            gc.collect()
+            progress.update(i / len(all_sentences))
 
-    # Clear memory
-    torch.cuda.empty_cache()
-    gc.collect()
+        except RuntimeError as e:
+            if "out of memory" in str(e):
+                # Reduce the batch size by half and retry
+                current_batch_size = max(1, current_batch_size // 2)
+                print(f"Out of memory, reducing batch size to {current_batch_size}. Retrying...")
+                torch.cuda.empty_cache()
+                gc.collect()
+            else:
+                raise e
 
     # Reconstruct paragraphs
     humanized_paragraphs = []
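
Note: the halve-on-OOM retry loop above is a reusable pattern. A generic standalone sketch of the same idea, with a hypothetical process_batch callable and an extra guard so a batch size of 1 cannot retry forever:

import gc
import torch

def map_in_adaptive_batches(items, process_batch, batch_size=64):
    """Apply process_batch to items in batches, halving the batch size on CUDA OOM."""
    results, i = [], 0
    while i < len(items):
        try:
            results.extend(process_batch(items[i : i + batch_size]))
            i += batch_size
        except RuntimeError as e:
            if "out of memory" not in str(e) or batch_size == 1:
                raise  # not an OOM error, or nothing left to shrink
            batch_size = max(1, batch_size // 2)
            torch.cuda.empty_cache()
            gc.collect()
    return results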
 