# NOTE(review): the following header was web-scrape residue (HF Space status
# "Sleeping", file size, git-blame hashes and a line-number gutter), not code.
# Converted to a comment so the module parses.
from transformers import pipeline
import requests
import json
import gradio as gr
# JavaScript run once when the app loads (passed to gr.Blocks(js=...)):
# registers showCard/hideCard on globalThis so the inline onmouseover /
# onmouseout / onclick handlers emitted by predict()'s HTML can call them.
# The hovercard element itself is appended to the output in predict().
js = """
async () => {
function showCard(event, title, content) {
document.getElementById('hovercard').style.visibility = 'visible';
document.getElementById('card_title').innerText = title;
document.getElementById('card_content').innerText = content;
}
function hideCard(event) {
document.getElementById('hovercard').style.visibility = 'hidden';
}
globalThis.showCard = showCard;
globalThis.hideCard = hideCard;
}
"""
def get_matches(text):
    """Run the token-labelling model on *text* and align its output onto it.

    Returns a tuple ``(text, pred, matches)`` where ``pred`` is the raw
    model output string and ``matches`` is a list of dicts with ``start``/
    ``end`` character offsets into ``text``, the matched surface form
    (``match``) and its lexicon label (``lexicon``).
    """
    # NOTE(review): max_length=5000 assumed large enough for UI input — confirm.
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    def get_mapping(pred):
        # Model output looks like "tok:LABEL+tok:LABEL = tok:LABEL+..." —
        # flatten it into [surface, label] pairs.
        pred = pred.split(" = ")
        pred = [x.split("+") for x in pred]
        flat = [x for y in pred for x in y]
        flat = [x.split(":") for x in flat]
        return flat

    mapping = get_mapping(pred)
    # Only keep well-formed "surface:label" pairs.
    mapping = [x for x in mapping if len(x) == 2]
    matches = []
    if not mapping:
        # Fix: the original popped from a possibly-empty list and raised
        # IndexError when the model produced no usable pairs.
        return (text, pred, matches)
    cur = mapping.pop(0)
    i = 0
    done = False
    # Greedy left-to-right alignment: advance through the original text,
    # consuming each predicted surface form at the first position it matches.
    while i < len(text) and not done:
        if text[i:].startswith(cur[0]):
            matches.append({"start": i, "end": i + len(cur[0]),
                            "match": cur[0], "lexicon": cur[1]})
            i += len(cur[0])
            if len(mapping) == 0:
                done = True
            else:
                cur = mapping.pop(0)
        else:
            i += 1
    return (text, pred, matches)
# Model and lexicon are loaded once at import time (this is a Space script).
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
# Playaling lexicon: maps a label to a dict with at least "translation" and
# "features" keys (both read in predict()).
r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
data = json.loads(r.text)
def predict(input):  # noqa: A002 — parameter name kept for interface stability
    """Tokenize/label *input* and render the result as interactive HTML.

    Each recognised token with an entry in the Playaling lexicon is wrapped
    in a <span> whose mouse/click handlers show a hover card with the
    token's translation and grammatical features.
    """
    import html as _html  # stdlib; for escaping text placed in HTML attributes

    text, pred, matches = get_matches(input)
    # Index matches by start offset for O(1) lookup while walking the text.
    matches = {x["start"]: x for x in matches}
    output = f"""
<div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
    i = 0
    while i < len(text):
        if i in matches:
            entry = matches[i]
            match = entry["lexicon"]
            # Labels may carry an "_R" suffix absent from the lexicon keys —
            # strip it before lookup. (Meaning of _R presumed a root marker;
            # TODO confirm.)
            if match.endswith("_R"):
                match = match[:-2]
            if match in data:
                # Fix: escape lexicon/model text before interpolating it into
                # single-quoted inline event-handler attributes — un-escaped
                # quotes would break out of the attribute (broken markup /
                # injection for arbitrary lexicon content).
                translation = _html.escape(data[match]["translation"], quote=True)
                features = _html.escape(data[match]["features"], quote=True)
                surface = _html.escape(entry["match"])
                output += f"""
<span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
onmouseover='showCard(event, "{translation}", "{features}")'
onmouseout='hideCard(event)' onclick='showCard(event, "{translation}", "{features}")'>{surface}</span>
"""
            else:
                output += entry["match"]
            i = entry["end"]
        else:
            # (The original special-cased spaces but emitted the same
            # character either way; a plain append is equivalent.)
            output += text[i]
            i += 1
    output += "</div>"
    # Hidden hover card; populated/toggled by the showCard/hideCard JS.
    output += """
<div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px;
border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
<h3 id='card_title' style='color: #000000;'></h3>
<p id='card_content' style='color: #000000;'></p>
</div>
"""
    return output
# ---- UI wiring ---------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler", js=js) as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        with gr.Column():
            input = gr.Textbox(label="Input", placeholder="Enter Arabic Text", lines=1)
            gr.Examples(examples=["بديش اروح معك", "معملتش اشي"], inputs=input)
            btn = gr.Button("Analyze")
        with gr.Column():
            html = gr.HTML()
    # Both the button and pressing Enter in the textbox run the analysis.
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])
    # NOTE(review): demo.load() has no fn/inputs/outputs — looks vestigial
    # (the page-load JS now goes through gr.Blocks(js=...)); confirm before
    # removing.
    demo.load()

# Fix: the original last line ended with a stray " |" scrape artifact,
# which is a syntax error.
if __name__ == "__main__":
    demo.launch()