from transformers import pipeline import requests import json import gradio as gr js = """ async () => { function showCard(event, title, content) { document.getElementById('hovercard').style.visibility = 'visible'; document.getElementById('card_title').innerText = title; document.getElementById('card_content').innerText = content; } function hideCard(event) { document.getElementById('hovercard').style.visibility = 'hidden'; } globalThis.showCard = showCard; globalThis.hideCard = hideCard; } """ def get_matches(text): pred = pipe(text, max_length=5000)[0]["translation_text"] def get_mapping(pred): pred = pred.split(" = ") pred = [x.split("+") for x in pred] flat = [x for y in pred for x in y] flat = [x.split(":") for x in flat] return flat mapping = get_mapping(pred) # only keep tuples with length 2 mapping = [x for x in mapping if len(x) == 2] matches = [] cur = mapping.pop(0) i = 0 done = False while i < len(text) and not done: if text[i:].startswith(cur[0]): matches.append({"start": i, "end": i+len(cur[0]), "match": cur[0], "lexicon": cur[1]}) i += len(cur[0]) if len(mapping) == 0: done = True else: cur = mapping.pop(0) else: i += 1 return (text, pred, matches) pipe = pipeline("translation", "guymorlan/TokenizerLabeller") r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json") data = json.loads(r.text) def predict(input): text, pred, matches = get_matches(input) matches = {x["start"]: x for x in matches} output = f"""