Tonic committed on
Commit
65bec20
1 Parent(s): 7c69429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -108
app.py CHANGED
@@ -1,6 +1,9 @@
1
  import torch
2
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
3
  import gradio as gr
 
 
 
4
 
5
  # Load pre-trained model and tokenizer
6
  model_name = "PleIAs/OCRonos-Vintage"
@@ -14,7 +17,11 @@ tokenizer.pad_token = tokenizer.eos_token
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
  model.to(device)
16
 
17
- # Function for generating text
 
 
 
 
18
  def historical_generation(prompt, max_new_tokens=600):
19
  prompt = f"### Text ###\n{prompt}"
20
  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
@@ -45,25 +52,17 @@ def historical_generation(prompt, max_new_tokens=600):
45
 
46
  # Tokenize the generated text
47
  tokens = tokenizer.tokenize(generated_text)
48
-
49
  # Create highlighted text output
50
  highlighted_text = []
51
  for token in tokens:
52
- # Clean token and get token type
53
- clean_token = token.replace("Ġ", "")
54
  token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
55
  highlighted_text.append((clean_token, token_type))
56
-
57
- return highlighted_text
58
-
59
- # Tokenizer information display
60
- import os
61
- os.system('python -m spacy download en_core_web_sm')
62
- import spacy
63
- from spacy import displacy
64
 
65
- nlp = spacy.load("en_core_web_sm")
66
 
 
67
  def text_analysis(text):
68
  doc = nlp(text)
69
  html = displacy.render(doc, style="dep", page=True)
@@ -80,11 +79,15 @@ def text_analysis(text):
80
 
81
  return pos_tokens, pos_count, html
82
 
83
- # Gradio interface for text analysis
84
  def full_interface(prompt, max_new_tokens):
85
- generated_highlight = historical_generation(prompt, max_new_tokens)
86
- tokens, pos_count, html = text_analysis(prompt)
87
- return generated_highlight, pos_count, html
 
 
 
 
88
 
89
  # Create Gradio interface
90
  iface = gr.Interface(
@@ -109,100 +112,14 @@ iface = gr.Interface(
109
  combine_adjacent=True,
110
  show_legend=True
111
  ),
112
- gr.JSON(label="Tokenizer Info"),
113
- gr.HTML(label="Dependency Parse Visualization")
 
114
  ],
115
  title="Historical Text Generation with OCRonos-Vintage",
116
- description="Generate historical-style text using OCRonos-Vintage and analyze the tokenizer output.",
117
  theme=gr.themes.Base()
118
  )
119
 
120
  if __name__ == "__main__":
121
- iface.launch()
122
-
123
- # import torch
124
- # from transformers import GPT2LMHeadModel, GPT2Tokenizer
125
- # import gradio as gr
126
-
127
- # Load pre-trained model and tokenizer
128
- # model_name = "PleIAs/OCRonos-Vintage"
129
- # model = GPT2LMHeadModel.from_pretrained(model_name)
130
- # tokenizer = GPT2Tokenizer.from_pretrained(model_name)
131
-
132
- # Set the pad token to be the same as the eos token
133
- # tokenizer.pad_token = tokenizer.eos_token
134
-
135
- # Set the device to GPU if available, otherwise use CPU
136
- # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
137
- # model.to(device)
138
-
139
- # def historical_generation(prompt, max_new_tokens=600):
140
- # prompt = f"### Text ###\n{prompt}"
141
- # inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
142
- # input_ids = inputs["input_ids"].to(device)
143
- # attention_mask = inputs["attention_mask"].to(device)
144
-
145
- # Generate text
146
- # output = model.generate(
147
- # input_ids,
148
- # attention_mask=attention_mask,
149
- # max_new_tokens=max_new_tokens,
150
- # pad_token_id=tokenizer.eos_token_id,
151
- # top_k=50,
152
- # temperature=0.3,
153
- # top_p=0.95,
154
- # do_sample=True,
155
- # repetition_penalty=1.5,
156
- # bos_token_id=tokenizer.bos_token_id,
157
- # eos_token_id=tokenizer.eos_token_id
158
- # )
159
-
160
- # Decode the generated text
161
- # generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
162
-
163
- # Remove the prompt from the generated text
164
- # generated_text = generated_text.replace("### Text ###\n", "").strip()
165
-
166
- # Tokenize the generated text
167
- # tokens = tokenizer.tokenize(generated_text)
168
-
169
- # Create highlighted text output
170
- # highlighted_text = []
171
- # for token in tokens:
172
- # Remove special tokens and get the token type
173
- # clean_token = token.replace("Ġ", "").replace("</w>", "")
174
- # token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
175
-
176
- # highlighted_text.append((clean_token, token_type))
177
-
178
- # return highlighted_text
179
-
180
- # Create Gradio interface
181
- # iface = gr.Interface(
182
- # fn=historical_generation,
183
- # inputs=[
184
- # gr.Textbox(
185
- # label="Prompt",
186
- # placeholder="Enter a prompt for historical text generation...",
187
- # lines=3
188
- # ),
189
- # gr.Slider(
190
- # label="Max New Tokens",
191
- # minimum=50,
192
- # maximum=1000,
193
- # step=50,
194
- # value=600
195
- # )
196
- # ],
197
- # outputs=gr.HighlightedText(
198
- # label="Generated Historical Text",
199
- # combine_adjacent=True,
200
- # show_legend=True
201
- # ),
202
- # title="Historical Text Generation with OCRonos-Vintage",
203
- # description="Generate historical-style text using the OCRonos-Vintage model. The output shows token types as highlights.",
204
- # theme=gr.themes.Base()
205
- # )
206
-
207
- # if __name__ == "__main__":
208
- # iface.launch()
 
1
  import torch
2
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
3
  import gradio as gr
4
+ import os
5
+ import spacy
6
+ from spacy import displacy
7
 
8
  # Load pre-trained model and tokenizer
9
  model_name = "PleIAs/OCRonos-Vintage"
 
17
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
  model.to(device)
19
 
20
+ # Load spaCy model for dependency parsing
21
+ os.system('python -m spacy download en_core_web_sm')
22
+ nlp = spacy.load("en_core_web_sm")
23
+
24
+ # Function for generating text and tokenizing
25
  def historical_generation(prompt, max_new_tokens=600):
26
  prompt = f"### Text ###\n{prompt}"
27
  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
 
52
 
53
  # Tokenize the generated text
54
  tokens = tokenizer.tokenize(generated_text)
55
+
56
  # Create highlighted text output
57
  highlighted_text = []
58
  for token in tokens:
59
+ clean_token = token.replace("Ġ", "") # Remove "Ġ"
 
60
  token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
61
  highlighted_text.append((clean_token, token_type))
 
 
 
 
 
 
 
 
62
 
63
+ return highlighted_text, generated_text # Return both tokenized and raw generated text
64
 
65
+ # Function for dependency parsing using spaCy
66
  def text_analysis(text):
67
  doc = nlp(text)
68
  html = displacy.render(doc, style="dep", page=True)
 
79
 
80
  return pos_tokens, pos_count, html
81
 
82
+ # Full interface combining text generation and analysis
83
  def full_interface(prompt, max_new_tokens):
84
+ generated_highlight, generated_text = historical_generation(prompt, max_new_tokens)
85
+
86
+ # Dependency parse of both input and generated text
87
+ tokens_input, pos_count_input, html_input = text_analysis(prompt)
88
+ tokens_generated, pos_count_generated, html_generated = text_analysis(generated_text)
89
+
90
+ return generated_highlight, pos_count_input, html_input, html_generated
91
 
92
  # Create Gradio interface
93
  iface = gr.Interface(
 
112
  combine_adjacent=True,
113
  show_legend=True
114
  ),
115
+ gr.JSON(label="Tokenizer Info (Input Text)"),
116
+ gr.HTML(label="Dependency Parse Visualization (Input Text)"),
117
+ gr.HTML(label="Dependency Parse Visualization (Generated Text)")
118
  ],
119
  title="Historical Text Generation with OCRonos-Vintage",
120
+ description="Generate historical-style text using OCRonos-Vintage and analyze the tokenizer output, including dependency parsing.",
121
  theme=gr.themes.Base()
122
  )
123
 
124
  if __name__ == "__main__":
125
+ iface.launch()