from transformers import pipeline
import requests
import json
import gradio as gr

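# JavaScript injected into the page through the js= argument of gr.Blocks below: it runs
# when the page loads and registers showCard/hideCard on globalThis so the inline
# onmouseover/onmouseout/onclick handlers in the HTML produced by predict() can find them.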
js = """ | |
async () => { | |
function showCard(event, title, content) { | |
document.getElementById('hovercard').style.visibility = 'visible'; | |
document.getElementById('card_title').innerText = title; | |
document.getElementById('card_content').innerText = content; | |
} | |
function hideCard(event) { | |
document.getElementById('hovercard').style.visibility = 'hidden'; | |
} | |
globalThis.showCard = showCard; | |
globalThis.hideCard = hideCard; | |
} | |
""" | |
def get_matches(text):
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    def get_mapping(pred):
        # split the prediction into "word = ..." chunks, then into token:label pieces
        pred = pred.split(" = ")
        pred = [x.split("+") for x in pred]
        flat = [x for y in pred for x in y]
        flat = [x.split(":") for x in flat]
        return flat

    mapping = get_mapping(pred)
    # only keep (surface, label) pairs, i.e. pieces that split into exactly two parts
    mapping = [x for x in mapping if len(x) == 2]
    matches = []
    if not mapping:  # nothing parsable in the prediction
        return (text, pred, matches)
    cur = mapping.pop(0)
    i = 0
    done = False
    # walk through the original text and greedily align each predicted surface form
    while i < len(text) and not done:
        if text[i:].startswith(cur[0]):
            matches.append({"start": i, "end": i + len(cur[0]), "match": cur[0], "lexicon": cur[1]})
            i += len(cur[0])
            if len(mapping) == 0:
                done = True
            else:
                cur = mapping.pop(0)
        else:
            i += 1
    return (text, pred, matches)
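
# Shape illustration only (hypothetical values, not actual model output):
#   text, pred, matches = get_matches("بديش اروح معك")
#   # matches is a list of {"start": ..., "end": ..., "match": ..., "lexicon": ...} dicts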

pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
data = json.loads(r.text)
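# "data" is the Playaling lexicon JSON; based on how it is used in predict() below, each
# entry is assumed to map a lexicon label to at least a "translation" and a "features" field.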

def predict(input):
    text, pred, matches = get_matches(input)
    matches = {x["start"]: x for x in matches}
    output = f"""
    <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
    i = 0
    while i < len(text):
        if i in matches:
            match = matches[i]["lexicon"]
            # if match ends with _R, remove _R suffix
            if match.endswith("_R"):
                match = match[:-2]
            if match in data:
                output += f"""
                <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
                onmouseover='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'
                onmouseout='hideCard(event)' onclick='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'>{matches[i]['match']}</span>
                """
            else:
                output += matches[i]["match"]
            i = matches[i]["end"]
        else:
            if text[i] == " ":
                output += " "
            else:
                output += text[i]
            i += 1
    output += "</div>"
    output += """
    <div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px;
    border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
        <h3 id='card_title' style='color: #000000;'></h3>
        <p id='card_content' style='color: #000000;'></p>
    </div>
    """
    return output
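
# Gradio UI: a Blocks layout with an input textbox, example sentences, and an Analyze
# button; both clicking the button and pressing Enter in the textbox call predict(), and
# the js snippet defined above is injected through the js= argument of gr.Blocks.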
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler", js=js) as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        with gr.Column():
            input = gr.Textbox(label="Input", placeholder="Enter Arabic Text", lines=1)
            gr.Examples(examples=["بديش اروح معك", "معملتش اشي"], inputs=input)
            btn = gr.Button("Analyze")
        with gr.Column():
            html = gr.HTML()
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])
    demo.load()

if __name__ == "__main__":
    demo.launch()