Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,42 +3,29 @@ import re
|
|
3 |
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
4 |
import torch
|
5 |
import gradio as gr
|
6 |
-
import
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
import os
|
9 |
|
10 |
-
# OCR
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
model_name = "PleIAs/OCRonos-Vintage"
|
12 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
13 |
-
|
14 |
-
# Load pre-trained model and tokenizer
|
15 |
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
|
16 |
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
font-size: 1.2em;
|
25 |
-
}
|
26 |
-
.inserted {
|
27 |
-
background-color: #90EE90;
|
28 |
-
}
|
29 |
-
</style>
|
30 |
-
"""
|
31 |
-
|
32 |
-
def generate_html_diff(old_text, new_text):
    """Render the words of ``new_text`` as HTML with insertions highlighted.

    Both texts are split on whitespace and compared word by word with
    ``difflib.Differ``. Words present in both texts are emitted as plain
    text, words present only in ``new_text`` are wrapped in
    ``<span class="inserted">``, and words present only in ``old_text`` are
    omitted.

    Args:
        old_text: The original text.
        new_text: The corrected text.

    Returns:
        The emitted words joined by single spaces. Every word is
        HTML-escaped so that markup contained in the input text is shown
        literally instead of being interpreted by the browser.
    """
    # Local imports keep the function independent of module-level imports.
    from difflib import Differ
    from html import escape

    html_diff = []
    for entry in Differ().compare(old_text.split(), new_text.split()):
        # Each Differ entry is a two-character prefix followed by the word.
        word = escape(entry[2:])
        if entry.startswith("  "):
            html_diff.append(word)
        elif entry.startswith("+ "):
            html_diff.append(f'<span class="inserted">{word}</span>')
        # "- " (removed) and "? " (intraline hint) entries are skipped.
    return " ".join(html_diff)
|
42 |
|
43 |
def split_text(text, max_tokens=400):
|
44 |
tokens = tokenizer.tokenize(text)
|
@@ -86,18 +73,22 @@ def process_text(user_message):
|
|
86 |
corrected_chunks.append(corrected_chunk)
|
87 |
|
88 |
corrected_text = ' '.join(corrected_chunks)
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
101 |
process_button.click(process_text, inputs=text_input, outputs=[text_output])
|
102 |
|
103 |
if __name__ == "__main__":
|
|
|
3 |
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
4 |
import torch
|
5 |
import gradio as gr
|
6 |
+
from difflib import Differ
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
import os
|
9 |
|
10 |
+
# Markdown text describing the demo; passed to the Markdown component in the UI.
description = """# 🙋🏻♂️Welcome to Tonic's On-Device📲⌚🎅🏻OCR Corrector (CPU)
📲⌚🎅🏻OCRonos-Vintage is a small specialized model for OCR correction of cultural heritage archives pre-trained with llm.c. OCRonos-Vintage is only 124 million parameters. It can run easily on CPU or provide correction at scale on GPUs (>10k tokens/seconds) while providing a quality of correction comparable to GPT-4 or the llama version of OCRonos for English-speaking cultural archives.

### Join us :
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""

# Identifier handed to ``from_pretrained`` for both the model and the tokenizer.
model_name = "PleIAs/OCRonos-Vintage"
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 🙋🏻♂️Welcome to Tonic's ⌚🎅🏻Vintage OCRonos Corrector (CPU)
# (The line above was bare text between statements, which is a syntax error;
# it is kept here as a comment.)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
|
22 |
|
23 |
+
def diff_texts(text1, text2):
    """Compute a word-level diff between two texts.

    Both texts are split on whitespace and compared with ``difflib.Differ``.

    Args:
        text1: The original text.
        text2: The revised text.

    Returns:
        A list of ``(word, label)`` tuples in diff order. ``label`` is
        ``"+"`` for a word only in ``text2``, ``"-"`` for a word only in
        ``text1`` and ``None`` for a word present in both.
    """
    d = Differ()
    return [
        (token[2:], token[0] if token[0] != " " else None)
        for token in d.compare(text1.split(), text2.split())
        # Differ interleaves "? " hint lines (caret markers such as " ^")
        # after similar words; they are not words and must not be emitted.
        if not token.startswith("?")
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
def split_text(text, max_tokens=400):
|
31 |
tokens = tokenizer.tokenize(text)
|
|
|
73 |
corrected_chunks.append(corrected_chunk)
|
74 |
|
75 |
corrected_text = ' '.join(corrected_chunks)
|
76 |
+
return diff_texts(user_message, corrected_text)
|
77 |
+
|
78 |
+
# Build the demo UI: description, input textbox, button, highlighted diff output.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    # The component is ``gr.Markdown``; ``gr.MarkDown`` does not exist and
    # raises AttributeError when the app starts.
    gr.Markdown(description)
    text_input = gr.Textbox(
        label="↘️Enter 👁️OCR'ed Text Outputs Here",
        info="""Hi there, ;fémy name à`gis tonic 45and i like to ride my vpotz""",
        lines=5,
    )
    process_button = gr.Button("Correct using 📲⌚🎅🏻OCRonos")
    # The color_map keys match the "+" / "-" labels used in the diff tuples.
    text_output = gr.HighlightedText(
        label="📲⌚🎅🏻OCRonos Correction:",
        combine_adjacent=True,
        show_legend=True,
        color_map={"+": "green", "-": "red"},
    )
    # Clicking the button calls process_text with the textbox content and
    # sends its return value to the highlighted-text output.
    process_button.click(process_text, inputs=text_input, outputs=[text_output])
|
93 |
|
94 |
if __name__ == "__main__":
|