Spaces:

Tonic
/

OCRonos-TextGen

Sleeping

App Files Community

Tonic committed on Sep 9, 2024

Commit

65bec20

verified ·

1 Parent(s): 7c69429

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -108

app.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import gradio as gr
 # Load pre-trained model and tokenizer
 model_name = "PleIAs/OCRonos-Vintage"
@@ -14,7 +17,11 @@ tokenizer.pad_token = tokenizer.eos_token
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-# Function for generating text
 def historical_generation(prompt, max_new_tokens=600):
     prompt = f"### Text ###\n{prompt}"
     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
@@ -45,25 +52,17 @@ def historical_generation(prompt, max_new_tokens=600):
     # Tokenize the generated text
     tokens = tokenizer.tokenize(generated_text)
     # Create highlighted text output
     highlighted_text = []
     for token in tokens:
-        # Clean token and get token type
-        clean_token = token.replace("Ġ", "")
         token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
         highlighted_text.append((clean_token, token_type))
-    return highlighted_text
-# Tokenizer information display
-import os
-os.system('python -m spacy download en_core_web_sm')
-import spacy
-from spacy import displacy
-nlp = spacy.load("en_core_web_sm")
 def text_analysis(text):
     doc = nlp(text)
     html = displacy.render(doc, style="dep", page=True)
@@ -80,11 +79,15 @@ def text_analysis(text):
     return pos_tokens, pos_count, html
-# Gradio interface for text analysis
 def full_interface(prompt, max_new_tokens):
-    generated_highlight = historical_generation(prompt, max_new_tokens)
-    tokens, pos_count, html = text_analysis(prompt)
-    return generated_highlight, pos_count, html
 # Create Gradio interface
 iface = gr.Interface(
@@ -109,100 +112,14 @@ iface = gr.Interface(
             combine_adjacent=True,
             show_legend=True
         ),
-        gr.JSON(label="Tokenizer Info"),
-        gr.HTML(label="Dependency Parse Visualization")
     ],
     title="Historical Text Generation with OCRonos-Vintage",
-    description="Generate historical-style text using OCRonos-Vintage and analyze the tokenizer output.",
     theme=gr.themes.Base()
 )
 if __name__ == "__main__":
-    iface.launch()
-# import torch
-# from transformers import GPT2LMHeadModel, GPT2Tokenizer
-# import gradio as gr
-# Load pre-trained model and tokenizer
-# model_name = "PleIAs/OCRonos-Vintage"
-# model = GPT2LMHeadModel.from_pretrained(model_name)
-# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-# Set the pad token to be the same as the eos token
-# tokenizer.pad_token = tokenizer.eos_token
-# Set the device to GPU if available, otherwise use CPU
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# model.to(device)
-# def historical_generation(prompt, max_new_tokens=600):
-#     prompt = f"### Text ###\n{prompt}"
-#     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
-#     input_ids = inputs["input_ids"].to(device)
-#     attention_mask = inputs["attention_mask"].to(device)
-    # Generate text
-#     output = model.generate(
-#         input_ids,
-#         attention_mask=attention_mask,
-#         max_new_tokens=max_new_tokens,
-#         pad_token_id=tokenizer.eos_token_id,
-#         top_k=50,
-#         temperature=0.3,
-#         top_p=0.95,
-#         do_sample=True,
-#         repetition_penalty=1.5,
-#         bos_token_id=tokenizer.bos_token_id,
-#         eos_token_id=tokenizer.eos_token_id
-#     )
-    # Decode the generated text
-#     generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
-    # Remove the prompt from the generated text
-#     generated_text = generated_text.replace("### Text ###\n", "").strip()
-    # Tokenize the generated text
-#     tokens = tokenizer.tokenize(generated_text)
-    # Create highlighted text output
-#     highlighted_text = []
-#     for token in tokens:
-        # Remove special tokens and get the token type
-#         clean_token = token.replace("Ġ", "").replace("</w>", "")
-#         token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
-#         highlighted_text.append((clean_token, token_type))
-#     return highlighted_text
-# Create Gradio interface
-# iface = gr.Interface(
-#     fn=historical_generation,
-#     inputs=[
-#         gr.Textbox(
-#             label="Prompt",
-#             placeholder="Enter a prompt for historical text generation...",
-#             lines=3
-#         ),
-#         gr.Slider(
-#             label="Max New Tokens",
-#             minimum=50,
-#             maximum=1000,
-#             step=50,
-#             value=600
-#         )
-#     ],
-#     outputs=gr.HighlightedText(
-#         label="Generated Historical Text",
-#         combine_adjacent=True,
-#         show_legend=True
-#     ),
-#     title="Historical Text Generation with OCRonos-Vintage",
-#     description="Generate historical-style text using the OCRonos-Vintage model. The output shows token types as highlights.",
-#     theme=gr.themes.Base()
-# )
-# if __name__ == "__main__":
-  #   iface.launch()

 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import gradio as gr
+import os
+import spacy
+from spacy import displacy
 # Load pre-trained model and tokenizer
 model_name = "PleIAs/OCRonos-Vintage"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+# Load spaCy model for dependency parsing
+os.system('python -m spacy download en_core_web_sm')
+nlp = spacy.load("en_core_web_sm")
+# Function for generating text and tokenizing
 def historical_generation(prompt, max_new_tokens=600):
     prompt = f"### Text ###\n{prompt}"
     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
     # Tokenize the generated text
     tokens = tokenizer.tokenize(generated_text)
     # Create highlighted text output
     highlighted_text = []
     for token in tokens:
+        clean_token = token.replace("Ġ", "")  # Remove "Ġ"
         token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
         highlighted_text.append((clean_token, token_type))
+    return highlighted_text, generated_text  # Return both tokenized and raw generated text
+# Function for dependency parsing using spaCy
 def text_analysis(text):
     doc = nlp(text)
     html = displacy.render(doc, style="dep", page=True)
     return pos_tokens, pos_count, html
+# Full interface combining text generation and analysis
 def full_interface(prompt, max_new_tokens):
+    generated_highlight, generated_text = historical_generation(prompt, max_new_tokens)
+    # Dependency parse of both input and generated text
+    tokens_input, pos_count_input, html_input = text_analysis(prompt)
+    tokens_generated, pos_count_generated, html_generated = text_analysis(generated_text)
+    return generated_highlight, pos_count_input, html_input, html_generated
 # Create Gradio interface
 iface = gr.Interface(
             combine_adjacent=True,
             show_legend=True
         ),
+        gr.JSON(label="Tokenizer Info (Input Text)"),
+        gr.HTML(label="Dependency Parse Visualization (Input Text)"),
+        gr.HTML(label="Dependency Parse Visualization (Generated Text)")
     ],
     title="Historical Text Generation with OCRonos-Vintage",
+    description="Generate historical-style text using OCRonos-Vintage and analyze the tokenizer output, including dependency parsing.",
     theme=gr.themes.Base()
 )
 if __name__ == "__main__":
+    iface.launch()