Tonic committed on
Commit
be6c757
·
verified ·
1 Parent(s): 3582cae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -11
app.py CHANGED
@@ -22,23 +22,23 @@ os.system('python -m spacy download en_core_web_sm')
22
  nlp = spacy.load("en_core_web_sm")
23
 
24
  # Function for generating text and tokenizing
25
- def historical_generation(prompt, max_new_tokens=600):
26
  prompt = f"### Text ###\n{prompt}"
27
  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
28
  input_ids = inputs["input_ids"].to(device)
29
  attention_mask = inputs["attention_mask"].to(device)
30
 
31
- # Generate text
32
  output = model.generate(
33
  input_ids,
34
  attention_mask=attention_mask,
35
  max_new_tokens=max_new_tokens,
36
  pad_token_id=tokenizer.eos_token_id,
37
- top_k=50,
38
- temperature=0.3,
39
- top_p=0.95,
40
  do_sample=True,
41
- repetition_penalty=1.5,
42
  bos_token_id=tokenizer.bos_token_id,
43
  eos_token_id=tokenizer.eos_token_id
44
  )
@@ -53,11 +53,11 @@ def historical_generation(prompt, max_new_tokens=600):
53
  # Tokenize the generated text
54
  tokens = tokenizer.tokenize(generated_text)
55
 
56
- # Create highlighted text output
57
  highlighted_text = []
58
  for token in tokens:
59
  clean_token = token.replace("Ġ", "") # Remove "Ġ"
60
- token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
61
  highlighted_text.append((clean_token, token_type))
62
 
63
  return highlighted_text, generated_text # Return both tokenized and raw generated text
@@ -85,8 +85,10 @@ def generate_dependency_parse(generated_text):
85
  return html_generated
86
 
87
  # Full interface combining text generation and analysis, split across steps
88
- def full_interface(prompt, max_new_tokens):
89
- generated_highlight, generated_text = historical_generation(prompt, max_new_tokens)
 
 
90
 
91
  # Dependency parse of input text
92
  tokens_input, pos_count_input, html_input = text_analysis(prompt)
@@ -101,7 +103,13 @@ def reset_interface():
101
  # Gradio interface components
102
  with gr.Blocks() as iface:
103
  prompt = gr.Textbox(label="Prompt", placeholder="Enter a prompt for historical text generation...", lines=3)
 
 
104
  max_new_tokens = gr.Slider(label="Max New Tokens", minimum=50, maximum=1000, step=50, value=600)
 
 
 
 
105
 
106
  # Output components
107
  highlighted_text = gr.HighlightedText(label="Generated Historical Text", combine_adjacent=True, show_legend=True)
@@ -126,7 +134,7 @@ with gr.Blocks() as iface:
126
  generate_button = gr.Button(value="Generate Text and Initial Outputs")
127
  generate_button.click(
128
  full_interface,
129
- inputs=[prompt, max_new_tokens],
130
  outputs=[highlighted_text, tokenizer_info, dependency_parse_input, send_button, dependency_parse_generated, generate_button, reset_button]
131
  )
132
 
 
22
  nlp = spacy.load("en_core_web_sm")
23
 
24
  # Function for generating text and tokenizing
25
+ def historical_generation(prompt, max_new_tokens=600, top_k=50, temperature=0.7, top_p=0.95, repetition_penalty=1.0):
26
  prompt = f"### Text ###\n{prompt}"
27
  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
28
  input_ids = inputs["input_ids"].to(device)
29
  attention_mask = inputs["attention_mask"].to(device)
30
 
31
+ # Generate text with customizable parameters
32
  output = model.generate(
33
  input_ids,
34
  attention_mask=attention_mask,
35
  max_new_tokens=max_new_tokens,
36
  pad_token_id=tokenizer.eos_token_id,
37
+ top_k=top_k,
38
+ temperature=temperature,
39
+ top_p=top_p,
40
  do_sample=True,
41
+ repetition_penalty=repetition_penalty,
42
  bos_token_id=tokenizer.bos_token_id,
43
  eos_token_id=tokenizer.eos_token_id
44
  )
 
53
  # Tokenize the generated text
54
  tokens = tokenizer.tokenize(generated_text)
55
 
56
+ # Create highlighted text output, remove "Ġ" from both the token and token_type
57
  highlighted_text = []
58
  for token in tokens:
59
  clean_token = token.replace("Ġ", "") # Remove "Ġ"
60
+ token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0].replace("Ġ", "")
61
  highlighted_text.append((clean_token, token_type))
62
 
63
  return highlighted_text, generated_text # Return both tokenized and raw generated text
 
85
  return html_generated
86
 
87
  # Full interface combining text generation and analysis, split across steps
88
+ def full_interface(prompt, max_new_tokens, top_k, temperature, top_p, repetition_penalty):
89
+ generated_highlight, generated_text = historical_generation(
90
+ prompt, max_new_tokens, top_k, temperature, top_p, repetition_penalty
91
+ )
92
 
93
  # Dependency parse of input text
94
  tokens_input, pos_count_input, html_input = text_analysis(prompt)
 
103
  # Gradio interface components
104
  with gr.Blocks() as iface:
105
  prompt = gr.Textbox(label="Prompt", placeholder="Enter a prompt for historical text generation...", lines=3)
106
+
107
+ # Slider for model parameters
108
  max_new_tokens = gr.Slider(label="Max New Tokens", minimum=50, maximum=1000, step=50, value=600)
109
+ top_k = gr.Slider(label="Top-k Sampling", minimum=1, maximum=100, step=1, value=50)
110
+ temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.5, step=0.1, value=0.7)
111
+ top_p = gr.Slider(label="Top-p (Nucleus Sampling)", minimum=0.1, maximum=1.0, step=0.05, value=0.95)
112
+ repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
113
 
114
  # Output components
115
  highlighted_text = gr.HighlightedText(label="Generated Historical Text", combine_adjacent=True, show_legend=True)
 
134
  generate_button = gr.Button(value="Generate Text and Initial Outputs")
135
  generate_button.click(
136
  full_interface,
137
+ inputs=[prompt, max_new_tokens, top_k, temperature, top_p, repetition_penalty],
138
  outputs=[highlighted_text, tokenizer_info, dependency_parse_input, send_button, dependency_parse_generated, generate_button, reset_button]
139
  )
140