OCRonos-TextCorrect

Running

App Files Files Community

Tonic committed on Sep 10

Commit

9626102

•

1 Parent(s): ba05a34

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -40

app.py CHANGED Viewed

@@ -3,42 +3,29 @@ import re
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import torch
 import gradio as gr
-import difflib
 from concurrent.futures import ThreadPoolExecutor
 import os
-# OCR Correction Model
 model_name = "PleIAs/OCRonos-Vintage"
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load pre-trained model and tokenizer
 model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-# CSS for formatting
-css = """
-<style>
-.generation {
-    margin-left: 2em;
-    margin-right: 2em;
-    font-size: 1.2em;
-}
-.inserted {
-    background-color: #90EE90;
-}
-</style>
-"""
-def generate_html_diff(old_text, new_text):
-    d = difflib.Differ()
-    diff = list(d.compare(old_text.split(), new_text.split()))
-    html_diff = []
-    for word in diff:
-        if word.startswith(' '):
-            html_diff.append(word[2:])
-        elif word.startswith('+ '):
-            html_diff.append(f'<span class="inserted">{word[2:]}</span>')
-    return ' '.join(html_diff)
 def split_text(text, max_tokens=400):
     tokens = tokenizer.tokenize(text)
@@ -86,18 +73,22 @@ def process_text(user_message):
         corrected_chunks.append(corrected_chunk)
     corrected_text = ' '.join(corrected_chunks)
-    html_diff = generate_html_diff(user_message, corrected_text)
-    ocr_result = f'<h2 style="text-align:center">OCR Correction</h2>\n<div class="generation">{html_diff}</div>'
-    final_output = f"{css}{ocr_result}"
-    return final_output
-# Define the Gradio interface
-with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
-    gr.HTML("""<h1 style="text-align:center">Vintage OCR corrector (CPU)</h1>""")
-    text_input = gr.Textbox(label="Your (bad?) text", type="text", lines=5)
-    process_button = gr.Button("Process Text")
-    text_output = gr.HTML(label="Processed text")
     process_button.click(process_text, inputs=text_input, outputs=[text_output])
 if __name__ == "__main__":

 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import torch
 import gradio as gr
+from difflib import Differ
 from concurrent.futures import ThreadPoolExecutor
 import os
+description = """# 🙋🏻‍♂️Welcome to Tonic's On-Device📲⌚🎅🏻OCR Corrector (CPU)
+    📲⌚🎅🏻OCRonos-Vintage is a small specialized model for OCR correction of cultural heritage archives pre-trained with llm.c. OCRonos-Vintage is only 124 million parameters. It can run easily on CPU or provide correction at scale on GPUs (>10k tokens/seconds) while providing a quality of correction comparable to GPT-4 or the llama version of OCRonos for English-speaking cultural archives.
+    ### Join us :
+    🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
+    """
 model_name = "PleIAs/OCRonos-Vintage"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# 🙋🏻‍♂️Welcome to Tonic's ⌚🎅🏻Vintage OCRonos Corrector (CPU)
 model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+def diff_texts(text1, text2):
+    """Return a word-level diff of two texts as (word, label) pairs.
+
+    Both texts are split on whitespace and compared with difflib.Differ.
+    The label is "-" for words only in text1, "+" for words only in text2,
+    and None for words present in both.
+    """
+    d = Differ()
+    return [
+        (token[2:], token[0] if token[0] != " " else None)
+        for token in d.compare(text1.split(), text2.split())
+        # Differ emits "? " hint lines next to near-identical words; they are
+        # not words from either text, so they must not reach the output.
+        if not token.startswith("?")
+    ]
 def split_text(text, max_tokens=400):
     tokens = tokenizer.tokenize(text)
         corrected_chunks.append(corrected_chunk)
     corrected_text = ' '.join(corrected_chunks)
+    return diff_texts(user_message, corrected_text)
+with gr.Blocks(theme=gr.themes.Base()) as demo:
+    # The Gradio component is spelled `gr.Markdown`; `gr.MarkDown` does not
+    # exist and raises AttributeError as soon as the app starts.
+    gr.Markdown(description)
+    text_input = gr.Textbox(
+        label="↘️Enter 👁️OCR'ed Text Outputs Here",
+        info="""Hi there, ;fémy name à`gis tonic 45and i like to ride my vpotz""",
+        lines=5,
+    )
+    process_button = gr.Button("Correct using 📲⌚🎅🏻OCRonos")
+    text_output = gr.HighlightedText(
+        label="📲⌚🎅🏻OCRonos Correction:",
+        combine_adjacent=True,
+        show_legend=True,
+        color_map={"+": "green", "-": "red"}
+    )
+    # process_text returns the (word, label) pairs built by diff_texts; the
+    # "+" / "-" labels are coloured through color_map above.
     process_button.click(process_text, inputs=text_input, outputs=[text_output])
 if __name__ == "__main__":