FremyCompany
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -45,12 +45,12 @@ if reroll_button:
     sent1 = mid_st.text_input("Type your second sentence here", value=random_sentences[1], key="sent1")
 else:
     # Allow the user to input two sentences
-    sent0 = mid_st.text_input("Type your first sentence here", value="De fait, mon mari ne parlait jamais de ses affaires avec moi.", key="sent0")
-    sent1 = mid_st.text_input("Type your second sentence here", value="M'n man had het met mij nooit over z'n zaken, inderdaad.", key="sent1")
+    sent0 = mid_st.text_input("Type your first sentence here", value="De fait, mon mari ne parlait jamais de ses affaires avec moi." if "sent0" not in st.session_state else st.session_state.sent0, key="sent0")
+    sent1 = mid_st.text_input("Type your second sentence here", value="M'n man had het met mij nooit over z'n zaken, inderdaad." if "sent1" not in st.session_state else st.session_state.sent1, key="sent1")
 
 
 # Display the mapping between the two sentences
-DEBUG =
+DEBUG = True
 if DEBUG:
 
     # Use some dummy data
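Note on this hunk: the new `value=` expressions seed the text inputs from `st.session_state` when it already holds a value, so user-typed sentences survive a Streamlit rerun instead of snapping back to the demo defaults. A minimal sketch of the same pattern, with a hypothetical widget key `demo_text`:

    import streamlit as st

    # On the first run the widget shows the demo default; on later reruns it
    # reuses whatever the user typed, which Streamlit stored under the key.
    default = "demo default" if "demo_text" not in st.session_state else st.session_state.demo_text
    text = st.text_input("Type here", value=default, key="demo_text")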
@@ -71,7 +71,7 @@ else:
     import torch.nn.functional as F
     from transformers import AutoTokenizer, AutoModel
 
-    @st.
+    @st.cache_resource
    def load_model_and_tokenizer():
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = AutoModel.from_pretrained('Parallia/Fairly-Multilingual-ModernBERT-Embed-BE', trust_remote_code=True).to(device)
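The old decorator is truncated in this view (`@st.`); the replacement is `@st.cache_resource`, which Streamlit recommends for global, unpicklable objects such as models: the function body runs once per process and every rerun shares the same instance. A toy sketch of how it differs from `@st.cache_data` (both decorators are real Streamlit APIs; the functions are made up):

    import streamlit as st

    @st.cache_resource  # one shared object per process; returned as-is, never copied
    def load_heavy_object():
        return object()  # stand-in for a model or client handle

    @st.cache_data  # memoizes by argument value; the result is pickled and copied per caller
    def square(x: int) -> int:
        return x * x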
@@ -80,16 +80,21 @@ else:
 
     model, tokenizer = load_model_and_tokenizer()
 
+    @st.cache_data
+    def encode_sentences(sent0, sent1):
+        sentences = [sent0, sent1]
+        tokens = []
+        embeddings = []
+        for sentence in sentences:
+            with torch.no_grad():
+                encoded_sentence = tokenizer(sentence, padding=False, truncation=True, return_tensors="pt").to(model.device)
+                embedded_sentence = model(**encoded_sentence).last_hidden_state[0].detach().cpu().clone()
+            tokens.append(tokenizer.tokenize(sentence))
+            embeddings.append(embedded_sentence)
+        return tokens, embeddings
+
     # Encode the sentences
-    sentences = [sent0, sent1]
-    tokens = []
-    embeddings = []
-    for sentence in sentences:
-        with torch.no_grad():
-            encoded_sentence = tokenizer(sentence, padding=False, truncation=True, return_tensors="pt").to(model.device)
-            embedded_sentence = model(**encoded_sentence).last_hidden_state[0].detach().cpu().clone()
-        tokens.append(tokenizer.tokenize(sentence))
-        embeddings.append(embedded_sentence)
+    tokens, embeddings = encode_sentences(sent0, sent1)
 
     # Calculate the cross-token similarity
     token_similarities = F.normalize(embeddings[0], dim=1) @ F.normalize(embeddings[1], dim=1).T
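Wrapping the encoding step in a `@st.cache_data` function keyed on the two raw sentences means the forward pass only reruns when the text actually changes; the returned token lists and CPU tensors are pickled into the cache and unpickled on later reruns. A rough sketch of that behavior, with a hypothetical `embed` helper standing in for the model call:

    import streamlit as st
    import torch

    @st.cache_data
    def embed(sentence: str) -> torch.Tensor:
        # Runs once per distinct sentence; identical calls return a cached copy.
        return torch.randn(4)  # stand-in for the real model forward pass

    first = embed("hello")   # computed
    second = embed("hello")  # served from the cache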
@@ -148,6 +153,8 @@ html += """
     animation-duration: 15s;
     animation-timing-function: steps(14, start);
     animation-iteration-count: infinite;
+    text-decoration: underline;
+    text-decoration: underline double;
     text-decoration: underline 0.3em;
     text-decoration-skip: none;
     text-decoration-skip-ink: none;
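The two inserted `text-decoration` lines form a graceful-degradation chain: browsers drop declarations they cannot parse, so each later declaration overrides the earlier ones only where its syntax is supported. Spelled out in the app's own `html += """..."""` style, with comments added for illustration:

    html += """
        text-decoration: underline;          /* every browser */
        text-decoration: underline double;   /* where line styles in the shorthand parse */
        text-decoration: underline 0.3em;    /* where thickness in the shorthand parses */
    """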
@@ -180,6 +187,8 @@ for j in range(len(tokens[1])):
     for i in range(len(tokens[0])):
         html += f"""--p{i}: {token_probabilities_12[i][j]}; """
     html += """}"""
+html += """
+body:has(#sent0:hover,#sent1:hover) span { --p: 0 !important; animation-play-state: paused; }"""
 for i in range(len(tokens[0])):
     html += f"""
 body:has(#sent0 span:nth-child({i+1}):hover) span {{ --p: var(--p{i}) !important; }}"""
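The rule added in the last hunk zeroes every token's `--p` and pauses the cycling animation as soon as either sentence is hovered; the per-token `body:has(#sent0 span:nth-child(...):hover)` rules emitted just below then point every span's `--p` at the probability stored for the hovered token, so hovering token i of the first sentence shows how strongly each token of the second sentence aligns with it. A stripped-down sketch of the generated CSS for a hypothetical two-token sentence:

    html = """
    /* Hovering either sentence zeroes the highlight and freezes the cycle */
    body:has(#sent0:hover,#sent1:hover) span { --p: 0 !important; animation-play-state: paused; }"""
    for i in range(2):  # two tokens in this toy example
        html += f"""
    body:has(#sent0 span:nth-child({i+1}):hover) span {{ --p: var(--p{i}) !important; }}"""
    print(html)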