alvp committed on
Commit
f78244e
·
1 Parent(s): db97ce2

Better UI and model comparison

Browse files
Files changed (3) hide show
  1. app.py +165 -16
  2. poems.py +173 -0
  3. requirements.txt +3 -2
app.py CHANGED
@@ -1,33 +1,182 @@
1
- from transformers import pipeline
 
 
 
 
 
2
  import streamlit as st
3
- import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
- def filter_candidates(candidates):
8
- df = pd.DataFrame(columns=["Candidates", "Probability"])
9
  cand_list = []
10
  score_list = []
11
  for candidate in candidates:
12
- if candidate["token_str"][:2] != "##":
13
  cand = candidate["sequence"]
14
  score = candidate["score"]
15
  cand_list.append(cand)
16
  score_list.append('{0:.5f}'.format(score))
17
- if len(score_list) == 5:
 
 
 
 
 
18
  break
19
- df["Candidates"] = cand_list
20
- df["Probability"] = score_list
21
-
22
- df.index = [1,2,3,4,5]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- return df
 
 
 
 
 
 
 
 
25
 
26
- nlp = pipeline("fill-mask", model="flax-community/alberti-bert-base-multilingual-cased")
 
 
 
27
 
28
- user_input = st.text_input("Mask token: [MASK]", "Me encanta escribir [MASK].")
29
 
30
- if st.button("Guess!"):
31
- results = filter_candidates(nlp(user_input, top_k=20))
32
- st.table(results)
 
33
 
 
 
1
+ import random
2
+ import re
3
+ from poems import SAMPLE_POEMS
4
+
5
+ import langid
6
+ import numpy as np
7
  import streamlit as st
8
+ import torch
9
+
10
+ from icu_tokenizer import Tokenizer
11
+ from transformers import pipeline
12
+
13
# Hugging Face Hub identifiers of the fill-mask models compared by the app,
# keyed by the short name used to refer to each model in this file.
MODELS = dict(
    ALBERTI="flax-community/alberti-bert-base-multilingual-cased",
    mBERT="bert-base-multilingual-cased",
)

# Number of highest-probability predictions considered for a masked token.
TOPK = 50
19
+ st.set_page_config(layout="wide")
20
+
21
 
22
def mask_line(line, language="es", restrictive=True):
    """Replace one randomly chosen token of ``line`` with ``[MASK]``.

    The line is split with ``Tokenizer(lang=language)`` and, after masking,
    the tokens are joined back together with single spaces.

    Args:
        line: Text of a single verse.
        language: Language code handed to the tokenizer. The value ``"zh"``
            also changes which tokens are eligible for masking.
        restrictive: When true, only tokens longer than three characters
            (or, for ``"zh"``, tokens that are fully alphabetic) may be
            masked. For every language other than ``"zh"`` the passed value
            is ignored and recomputed: the restriction applies only if the
            line has at least one token longer than three characters.

    Returns:
        The space-joined tokens with exactly one replaced by ``"[MASK]"``.

    Raises:
        ValueError: If tokenizing ``line`` yields no tokens.
    """
    token_list = Tokenizer(lang=language).tokenize(line)
    if not token_list:
        raise ValueError(f"Cannot mask a line without tokens: {line!r}")

    # Use the ``language`` argument rather than a module-level variable so
    # the function gives the same answer no matter who calls it.
    is_chinese = language == "zh"
    if not is_chinese:
        restrictive = any(len(token) > 3 for token in token_list)

    candidate_indexes = range(len(token_list))
    if restrictive:
        eligible = [
            index
            for index, token in enumerate(token_list)
            if len(token) > 3 or (is_chinese and token.isalpha())
        ]
        # Picking directly among the eligible tokens is equivalent to
        # retrying random picks until one qualifies, but it always
        # terminates. When nothing qualifies, any token may be masked.
        if eligible:
            candidate_indexes = eligible

    token_list[random.choice(candidate_indexes)] = "[MASK]"
    return " ".join(token_list)
39
 
40
 
41
def filter_candidates(candidates, get_any_candidate=False):
    """Return the ``"sequence"`` of the first acceptable candidate.

    A candidate is acceptable when its ``"token_str"`` does not start with
    ``"##"`` and is fully alphabetic. Candidates are examined in the order
    given, so the result is the first acceptable one in that order.

    Args:
        candidates: Iterable of dicts with at least the keys ``"sequence"``
            and ``"token_str"``.
        get_any_candidate: When true, skip the filtering and return the
            sequence of the very first candidate.

    Returns:
        The ``"sequence"`` value of the chosen candidate. If no candidate is
        acceptable, the sequence of the first candidate is returned instead.

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    first_sequence = None
    has_candidates = False
    for candidate in candidates:
        if not has_candidates:
            has_candidates = True
            first_sequence = candidate["sequence"]
        if get_any_candidate:
            return candidate["sequence"]
        token_str = candidate["token_str"]
        if not token_str.startswith("##") and token_str.isalpha():
            return candidate["sequence"]

    if not has_candidates:
        # Fail loudly instead of recursing forever on an empty input.
        raise ValueError("filter_candidates() needs at least one candidate")
    # Nothing passed the filter: fall back to the first candidate.
    return first_sequence
61
+
62
+
63
def infer_candidates(nlp, line):
    """Predict the ``TOPK`` most probable fillers for the mask in ``line``.

    Args:
        nlp: Fill-mask pipeline object. Its ``tokenizer`` attribute and its
            ``_parse_and_tokenize`` / ``_forward`` methods are used directly.
            NOTE(review): those two methods are private pipeline API and
            presumably depend on the installed transformers version.
        line: Text containing exactly one mask token.

    Returns:
        A list of ``TOPK`` dicts in the order produced by ``Tensor.topk``,
        each with the keys ``"sequence"`` (the line with the prediction
        wrapped in a red ``<b>`` tag), ``"score"``, ``"token"``,
        ``"token_str"`` and ``"masked_index"``.

    Raises:
        ValueError: If ``line`` does not contain exactly one mask token.
    """
    # Swap typographic apostrophes and ellipses for their ASCII forms.
    line = line.replace("’", "'").replace("…", "...")
    inputs = nlp._parse_and_tokenize(line)
    outputs = nlp._forward(inputs, return_tensors=True)
    input_ids = inputs["input_ids"][0]
    masked_index = torch.nonzero(input_ids == nlp.tokenizer.mask_token_id,
                                 as_tuple=False)
    if masked_index.numel() != 1:
        raise ValueError(
            f"Expected exactly one mask token, found {masked_index.numel()}: "
            f"{line!r}"
        )
    mask_position = masked_index.item()
    logits = outputs[0, mask_position, :]
    probs = logits.softmax(dim=0)
    values, predictions = probs.topk(TOPK)
    pad_token_id = nlp.tokenizer.pad_token_id

    result = []
    for score, token_id in zip(values.tolist(), predictions.tolist()):
        # Work on a copy: ``Tensor.numpy()`` shares memory with the tensor,
        # so writing into it directly would overwrite the pipeline inputs.
        tokens = input_ids.numpy().copy()
        tokens[mask_position] = token_id
        # Filter padding out:
        tokens = tokens[np.where(tokens != pad_token_id)]
        token_list = [
            nlp.tokenizer.decode([token], skip_special_tokens=True)
            for token in tokens
        ]
        pieces = []
        for idx, token in enumerate(token_list):
            if token.startswith('##'):
                # Word-piece continuation: glue it onto the previous piece.
                pieces[-1] += token[2:]
            elif idx == mask_position:
                # NOTE(review): tokens are inserted into HTML unescaped.
                pieces += ['<b style="color: #ff0000;">', token, "</b>"]
            else:
                pieces.append(token)
        sequence = " ".join(pieces).strip()
        result.append(
            {
                "sequence": sequence,
                "score": score,
                "token": token_id,
                "token_str": nlp.tokenizer.decode(token_id),
                "masked_index": mask_position,
            }
        )
    return result
101
+
102
+
103
def rewrite_poem(poem, ml_model=MODELS["ALBERTI"], masking=True, language="es"):
    """Mask one token per line of ``poem`` and let a model fill it back in.

    Args:
        poem: Iterable of strings, one per line of the poem.
        ml_model: Model identifier passed to the ``fill-mask`` pipeline.
        masking: When true, each non-blank line is masked with
            ``mask_line``. When false, lines are passed to the model as
            given, so they must already contain a mask token.
        language: Language code forwarded to ``mask_line``.

    Returns:
        A tuple ``(unmasked_poem, masked_lines)``. ``unmasked_poem`` is the
        rewritten lines joined with ``"<br>"``. ``masked_lines`` is the list
        of lines that were sent to the model. Blank lines appear as ``""``
        in both.
    """
    nlp = pipeline("fill-mask", model=ml_model)
    unmasked_lines = []
    masked_lines = []
    for line in poem:
        if not line.strip():
            # Empty and whitespace-only lines have nothing to mask; keep
            # them as blank lines so the stanza layout is preserved.
            unmasked_lines.append("")
            masked_lines.append("")
            continue
        masked_line = mask_line(line, language) if masking else line
        masked_lines.append(masked_line)
        candidates = infer_candidates(nlp, masked_line)
        unmasked_lines.append(filter_candidates(candidates))
    return "<br>".join(unmasked_lines), masked_lines
122
+
123
+
124
# Sidebar: project description, sample chooser and usage instructions.
instructions_text_0 = st.sidebar.markdown(
    """# ALBERTI vs BERT 🥊

We present ALBERTI, our BERT-based multilingual model for poetry.""")

instructions_text_1 = st.sidebar.markdown(
    """We have trained bert on a huge (for poetry, that is) corpus of
multilingual poetry to try to get a more 'poetic' model. This is the result
of our work.

You can find more information on the [project's site](https://huggingface.co/flax-community/alberti-bert-base-multilingual-cased)""")

sample_chooser = st.sidebar.selectbox(
    "Choose a poem",
    list(SAMPLE_POEMS)
)

instructions_text_2 = st.sidebar.markdown("""# How to use

You can choose from a list of example poems in Spanish, English, French, German,
Chinese and Arabic, but you can also paste a poem or write it yourself!

Then click on 'Rewrite!' to do the masking and the fill-mask task on the chosen
poem.""")

# Main area: three columns (input poem, ALBERTI output, mBERT output).
# Newer Streamlit releases expose the column layout as `st.columns`; fall
# back to the older `st.beta_columns` name when that is all that exists.
_make_columns = getattr(st, "columns", None) or st.beta_columns
col1, col2, col3 = _make_columns(3)

# Make widget labels larger and bold.
st.markdown(
    """
<style>
label {
font-size: 1rem !important;
font-weight: bold !important;
}
</style>
""", unsafe_allow_html=True)
160
 
161
if sample_chooser:
    # Pre-fill the editor with the selected sample; the user may edit it.
    user_input = col1.text_area("Input poem",
                                "\n".join(SAMPLE_POEMS[sample_chooser]),
                                height=600)
    poem = user_input.split("\n")
    rewrite_button = col1.button("Rewrite!")
    if "[MASK]" in user_input or "<mask>" in user_input:
        col1.error("You don't have to mask the poem, we'll do it for you!")

    if rewrite_button:
        # Detect the poem's language so masking can adapt to it.
        lang = langid.classify(user_input)[0]
        unmasked_poem, masked_poem = rewrite_poem(poem, language=lang)
        col2.write(f"""<b>Output poem from ALBERTI</b>


{unmasked_poem}""", unsafe_allow_html=True)
        # Feed mBERT the very same masked lines so both models fill the
        # same gaps and their outputs can be compared side by side.
        unmasked_poem_2, _ = rewrite_poem(masked_poem, ml_model=MODELS["mBERT"],
                                          masking=False)
        col3.write(f"""<b>Output poem from mBERT</b>

{unmasked_poem_2}""", unsafe_allow_html=True)
poems.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SAMPLE_POEMS = {
2
+ "es_1": [
3
+ "A través del follaje perenne",
4
+ "Que oír deja rumores extraños,",
5
+ "Y entre un mar de ondulante verdura,",
6
+ "Amorosa mansión de los pájaros,",
7
+ "Desde mis ventanas veo",
8
+ "El templo que quise tanto.",
9
+ "",
10
+ "El templo que tanto quise...",
11
+ "Pues no sé decir ya si le quiero,",
12
+ "Que en el rudo vaivén que sin tregua",
13
+ "Se agitan mis pensamientos,",
14
+ "Dudo si el rencor adusto",
15
+ "Vive unido al amor en mi pecho."],
16
+ "es_2": [
17
+ "Es hielo abrasador, es fuego helado,",
18
+ "es herida que duele y no se siente,",
19
+ "es un soñado bien, un mal presente,",
20
+ "es un breve descanso muy cansado.",
21
+ "",
22
+ "Es un descuido que nos da cuidado,",
23
+ "un cobarde con nombre de valiente,",
24
+ "un andar solitario entre la gente,",
25
+ "un amar solamente ser amado.",
26
+ "",
27
+ "Es una libertad encarcelada,",
28
+ "que dura hasta el postrero paroxismo;",
29
+ "enfermedad que crece si es curada.",
30
+ "Éste es el niño Amor, éste es su abismo.",
31
+ "¿Mirad cuál amistad tendrá con nada",
32
+ "el que en todo es contrario de sí mismo!"],
33
+ "en_1": [
34
+ "Two roads diverged in a yellow wood,",
35
+ "And sorry I could not travel both",
36
+ "And be one traveler, long I stood",
37
+ "And looked down one as far as I could",
38
+ "To where it bent in the undergrowth;",
39
+ "",
40
+ "Then took the other, as just as fair,",
41
+ "And having perhaps the better claim,",
42
+ "Because it was grassy and wanted wear;",
43
+ "Though as for that the passing there",
44
+ "Had worn them really about the same,",
45
+ "",
46
+ "And both that morning equally lay",
47
+ "In leaves no step had trodden black.",
48
+ "Oh, I kept the first for another day!",
49
+ "Yet knowing how way leads on to way,",
50
+ "I doubted if I should ever come back.",
51
+ "",
52
+ "I shall be telling this with a sigh",
53
+ "Somewhere ages and ages hence:",
54
+ "Two roads diverged in a wood, and I—",
55
+ "I took the one less traveled by,",
56
+ "And that has made all the difference."],
57
+ "en_2": [
58
+ "April is the cruellest month, breeding",
59
+ "Lilacs out of the dead land, mixing",
60
+ "Memory and desire, stirring",
61
+ "Dull roots with spring rain.",
62
+ "Winter kept us warm, covering",
63
+ "Earth in forgetful snow, feeding",
64
+ "A little life with dried tubers.",
65
+ "Summer surprised us, coming over the Starnbergersee",
66
+ "With a shower of rain; we stopped in the colonnade,",
67
+ "And went on in sunlight, into the Hofgarten,",
68
+ "And drank coffee, and talked for an hour.",
69
+ "Bin gar keine Russin, stamm' aus Litauen, echt deutsch.",
70
+ "And when we were children, staying at the arch-duke's,",
71
+ "My cousin's, he took me out on a sled,",
72
+ "And I was frightened. He said, Marie,",
73
+ "Marie, hold on tight. And down we went.",
74
+ "In the mountains, there you feel free.",
75
+ "I read, much of the night, and go south in the winter."],
76
+ "fr_1": [
77
+ "Demain, dès l'aube, à l'heure où blanchit la campagne,",
78
+ "Je partirai. Vois-tu, je sais que tu m'attends.",
79
+ "J'irai par la forêt, j'irai par la montagne.",
80
+ "Je ne puis demeurer loin de toi plus longtemps.",
81
+ "",
82
+ "Je marcherai les yeux fixés sur mes pensées,",
83
+ "Sans rien voir au dehors, sans entendre aucun bruit,",
84
+ "Seul, inconnu, le dos courbé, les mains croisées,",
85
+ "Triste, et le jour pour moi sera comme la nuit.",
86
+ "",
87
+ "Je ne regarderai ni l'or du soir qui tombe,",
88
+ "Ni les voiles au loin descendant vers Harfleur,",
89
+ "Et quand j'arriverai, je mettrai sur ta tombe",
90
+ "Un bouquet de houx vert et de bruyère en fleur."],
91
+ "fr_2": [
92
+ "Cheminement de tous les clochers",
93
+ "sur le ciel",
94
+ "guet-apens très doux",
95
+ "des aéroplanes",
96
+ "sur ton cœur",
97
+ "comme les hirondelles",
98
+ "que tu apprivoises",
99
+ "avec ton ombre",
100
+ "",
101
+ "Tu peux t'éloigner",
102
+ "dans la magie",
103
+ "des fleurs nocturnes",
104
+ "tu peux prendre la tempête",
105
+ "pour amie",
106
+ "je serai ce lac de brume",
107
+ "à ton arrivée",
108
+ "ce lac de brume",
109
+ "et tu diras que tu aimes",
110
+ "toutes les lumières",
111
+ "de la ville."],
112
+ "de_1": [
113
+ "Der du von dem Himmel bist,",
114
+ "Alles Leid und Schmerzen stillest,",
115
+ "Den, der doppelt elend ist,",
116
+ "Doppelt mit Erquickung füllest;",
117
+ "Ach, ich bin des Treibens müde!",
118
+ "Was soll all der Schmerz und Lust?",
119
+ "Süßer Friede,",
120
+ "Komm, ach komm in meine Brust!"],
121
+ "de_2": [
122
+ "Wieder duftet der Wald. ",
123
+ "Es heben die schwebenden Lerchen",
124
+ "mit sich den Himmel empor, der unseren Schultern schwer war; ",
125
+ "zwar sah man noch durch die Äste den Tag, wie er leer war,- ",
126
+ "aber nach langen, regnenden Nachmittagen ",
127
+ "kommen die goldübersonnten ",
128
+ "neueren Stunden, ",
129
+ "vor denen flüchtend an fernen Häuserfronten ",
130
+ "alle die wunden Fenster furchtsam mit Flügeln schlagen. ",
131
+ "Dann wird es still. Sogar der Regen geht leiser",
132
+ "über der Steine ruhig dunkelnden Glanz.",
133
+ "Alle Geräusche ducken sich ganz",
134
+ "in die glänzenden Knospen der Reiser."],
135
+ "zh_1": [
136
+ "春眠不觉晓,",
137
+ "处处闻啼鸟。",
138
+ "",
139
+ "夜来风雨声,",
140
+ "花落知多少"],
141
+ "zh_2": [
142
+ "关关雎鸠,在河之洲。",
143
+ "窈窕淑女,君子好逑。",
144
+ "",
145
+ "参差荇菜,左右流之。",
146
+ "窈窕淑女,寤寐求之。",
147
+ "",
148
+ "求之不得,寤寐思服。",
149
+ "悠哉悠哉,辗转反侧。",
150
+ "",
151
+ "参差荇菜,左右采之。",
152
+ "窈窕淑女,琴瑟友之。",
153
+ "",
154
+ "参差荇菜,左右毛之。",
155
+ "窈窕淑女,钟鼓乐之。"],
156
+ "ar_1": [
157
+ "داب نعشق لأليمه نجيمه",
158
+ "من يحبك ويموت فيك",
159
+ "إن قتلت عاد يكون بيك",
160
+ "لو قدر قلبي يخليك",
161
+ "لم يدبّر ذا النُّغيمة",
162
+ "يا مطرنَنِ شِلِباطُ (يا مذهول)",
163
+ "تُن حزين تنِ بناطُ (إنك مكروب)",
164
+ "ترى اليوم وَشْطاطُ (ضائعاً)",
165
+ "لم تذقي فيه غير لقيمة"],
166
+ "ar_2": [
167
+ "حَيّوا تُماضِرَ وَاِربَعوا صَحبي\t\tوَقِفوا فَإِنَّ وُقوفَكُم حَسبي",
168
+ "أَخُناسُ قَد هامَ الفُؤادُ بِكُم\t\tوَأَصابَهُ تَبَلٌ مِنَ الحُبِّ",
169
+ "ما إِن رَأَيتُ وَلا سَمِعتُ بِهِ\t\tكَاليَومِ طالي أَينُقٍ جُربِ",
170
+ "مُتَبَذِّلاً تَبدو مَحاسِنُهُ\t\tضَعُ الهِناءَ مَواضِعَ النُقبِ",
171
+ "مُتَحَسِّراً نَضَحَ الهِناءَ بِهِ\t\tضحَ العَبيرِ بِرَيطَةِ العَصبِ",
172
+ "فَسَليهُمُ عَنّي خُناسُ إِذا\t\tعَضَّ الجَميعَ الخَطبُ ما خَطبي"]
173
+ }
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  transformers
2
- pandas
3
  torch
4
- tensorflow
 
 
 
1
  transformers
 
2
  torch
3
+ streamlit
4
+ icu_tokenizer
5
+ langid