Spaces:

linhd-postdata
/

alberti

Sleeping

App Files Files Community

versae commited on Jul 10, 2023

Commit

34795be

•

1 Parent(s): e112560

Create app.py

Browse files

Files changed (1) hide show

app.py +202 -0

app.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import random
+import re
+from poems import SAMPLE_POEMS
+import langid
+import numpy as np
+import streamlit as st
+import torch
+from icu_tokenizer import Tokenizer
+from transformers import pipeline
+MODELS = {
+    "ALBERTI": "linhd-postdata/alberti-bert-base-multilingual-cased",
+    "mBERT": "bert-base-multilingual-cased"
+}
+TOPK = 50
+st.set_page_config(layout="wide")
+def mask_line(line, language="es", restrictive=True):
+    tokenizer = Tokenizer(lang=language)
+    token_list = tokenizer.tokenize(line)
+    if lang != "zh":
+        restrictive = not all([len(token) <= 3 for token in token_list])
+    random_num = random.randint(0, len(token_list) - 1)
+    random_word = token_list[random_num]
+    if not restrictive:
+        token_list[random_num] = "[MASK]"
+        masked_l = " ".join(token_list)
+        return masked_l
+    elif len(random_word) > 3 or (lang == "zh" and random_word.isalpha()):
+        token_list[random_num] = "[MASK]"
+        masked_l = " ".join(token_list)
+        return masked_l
+    else:
+        return mask_line(line, language)
+def filter_candidates(candidates, get_any_candidate=False):
+    cand_list = []
+    score_list = []
+    for candidate in candidates:
+        if not get_any_candidate and candidate["token_str"][:2] != "##" and candidate["token_str"].isalpha():
+            cand = candidate["sequence"]
+            score = candidate["score"]
+            cand_list.append(cand)
+            score_list.append('{0:.5f}'.format(score))
+        elif get_any_candidate:
+            cand = candidate["sequence"]
+            score = candidate["score"]
+            cand_list.append(cand)
+            score_list.append('{0:.5f}'.format(score))
+        if len(score_list) == TOPK:
+            break
+    if len(cand_list) < 1:
+        return filter_candidates(candidates, get_any_candidate=True)
+    else:
+        return cand_list[0]
+def infer_candidates(nlp, line):
+    line = re.sub("–", "-", line)
+    line = re.sub("—", "-", line)
+    line = re.sub("’", "'", line)
+    line = re.sub("…", "...", line)
+    inputs = nlp._parse_and_tokenize(line)
+    outputs = nlp._forward(inputs, return_tensors=True)
+    input_ids = inputs["input_ids"][0]
+    masked_index = torch.nonzero(input_ids == nlp.tokenizer.mask_token_id,
+                                 as_tuple=False)
+    logits = outputs[0, masked_index.item(), :]
+    probs = logits.softmax(dim=0)
+    values, predictions = probs.topk(TOPK)
+    result = []
+    for v, p in zip(values.tolist(), predictions.tolist()):
+        tokens = input_ids.numpy()
+        tokens[masked_index] = p
+        # Filter padding out:
+        tokens = tokens[np.where(tokens != nlp.tokenizer.pad_token_id)]
+        l = []
+        token_list = [nlp.tokenizer.decode([token], skip_special_tokens=True) for token in tokens]
+        for idx, token in enumerate(token_list):
+            if token.startswith('##'):
+                l[-1] += token[2:]
+            elif idx == masked_index.item():
+                l += ['<b style="color: #ff0000;">', token, "</b>"]
+            else:
+                l += [token]
+        sequence = " ".join(l).strip()
+        result.append(
+            {
+                "sequence": sequence,
+                "score": v,
+                "token": p,
+                "token_str": nlp.tokenizer.decode(p),
+                "masked_index": masked_index.item()
+            }
+        )
+    return result
+def rewrite_poem(poem, ml_model=MODELS["ALBERTI"], masking=True, language="es"):
+    nlp = pipeline("fill-mask", model=ml_model)
+    unmasked_lines = []
+    masked_lines = []
+    for line in poem:
+        if line == "":
+            unmasked_lines.append("")
+            masked_lines.append("")
+            continue
+        if masking:
+            masked_line = mask_line(line, language)
+        else:
+            masked_line = line
+        masked_lines.append(masked_line)
+        unmasked_line_candidates = infer_candidates(nlp, masked_line)
+        unmasked_line = filter_candidates(unmasked_line_candidates)
+        unmasked_lines.append(unmasked_line)
+        unmasked_poem = "<br>".join(unmasked_lines)
+    return unmasked_poem, masked_lines
+instructions_text_0 = st.sidebar.markdown(
+    """# ALBERTI vs BERT 🥊
+We present ALBERTI, our BERT-based multilingual model for poetry.""")
+instructions_text_1 = st.sidebar.markdown(
+    """We have trained bert on a huge (for poetry, that is) corpus of
+multilingual poetry to try to get a more 'poetic' model. This is the result
+of our work.
+You can find more information on the [project's site](https://huggingface.co/flax-community/alberti-bert-base-multilingual-cased)""")
+sample_chooser = st.sidebar.selectbox(
+    "Choose a poem",
+    list(SAMPLE_POEMS.keys())
+)
+instructions_text_2 = st.sidebar.markdown("""# How to use
+You can choose from a list of example poems in Spanish, English, French, German,
+Chinese and Arabic, but you can also paste a poem, or write it yourself!
+Then click on 'Rewrite!' to do the masking and the fill-mask task on the chosen
+poem, randomly masking one word per verse, and get the two new versions for each of the models.
+The list of languages used on the training of ALBERTI are:
+* Arabic
+* Chinese
+* Czech
+* English
+* Finnish
+* French
+* German
+* Hungarian
+* Italian
+* Portuguese
+* Russian
+* Spanish""")
+col1, col2, col3 = st.columns(3)
+st.markdown(
+    """
+    <style>
+        label {
+        font-size: 1rem !important;
+        font-weight: bold !important;
+        }
+        .block-container {
+        padding-left: 1rem !important;
+        padding-right: 1rem !important;
+        }
+    </style>
+    """, unsafe_allow_html=True)
+if sample_chooser:
+    model_list = set(MODELS.values())
+    user_input = col1.text_area("Input poem",
+                                "\n".join(SAMPLE_POEMS[sample_chooser]),
+                                height=600)
+    poem = user_input.split("\n")
+    rewrite_button = col1.button("Rewrite!")
+    if "[MASK]" in user_input or "<mask>" in user_input:
+        col1.error("You don't have to mask the poem, we'll do it for you!")
+if rewrite_button:
+    lang = langid.classify(user_input)[0]
+    unmasked_poem, masked_poem = rewrite_poem(poem, language=lang)
+    user_input_2 = col2.write(f"""<b>Output poem from ALBERTI</b>
+{unmasked_poem}""", unsafe_allow_html=True)
+    unmasked_poem_2, _ = rewrite_poem(masked_poem, ml_model=MODELS["mBERT"],
+                                      masking=False)
+    user_input_3 = col3.write(f"""<b>Output poem from mBERT</b>
+{unmasked_poem_2}""", unsafe_allow_html=True)