FremyCompany committed
Commit 9a496bb · verified · 1 parent: 71c0f27

Update app.py

Files changed (1):
  1. app.py (+22, −13)
app.py CHANGED
@@ -45,12 +45,12 @@ if reroll_button:
     sent1 = mid_st.text_input("Type your second sentence here", value=random_sentences[1], key="sent1")
 else:
     # Allow the user to input two sentences
-    sent0 = mid_st.text_input("Type your first sentence here", value="De fait, mon mari ne parlait jamais de ses affaires avec moi.", key="sent0")
-    sent1 = mid_st.text_input("Type your second sentence here", value="M'n man had het met mij nooit over z'n zaken, inderdaad.", key="sent1")
+    sent0 = mid_st.text_input("Type your first sentence here", value="De fait, mon mari ne parlait jamais de ses affaires avec moi." if "sent0" not in st.session_state else st.session_state.sent0, key="sent0")
+    sent1 = mid_st.text_input("Type your second sentence here", value="M'n man had het met mij nooit over z'n zaken, inderdaad." if "sent1" not in st.session_state else st.session_state.sent1, key="sent1")
 
 
 # Display the mapping between the two sentences
-DEBUG = False
+DEBUG = True
 if DEBUG:
 
     # Use some dummy data
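The rewritten `text_input` calls keep whatever the user last typed: Streamlit reruns the whole script on every interaction, so the default `value` should only apply when the widget's `key` has no entry in `st.session_state` yet. A minimal sketch of the pattern outside the app, where `DEFAULT_TEXT` is a placeholder name, not something from the source:

```python
# Sketch of the session-state-aware default used in the diff above.
# DEFAULT_TEXT is illustrative only.
import streamlit as st

DEFAULT_TEXT = "De fait, mon mari ne parlait jamais de ses affaires avec moi."

sent0 = st.text_input(
    "Type your first sentence here",
    # fall back to the default only on the very first run
    value=DEFAULT_TEXT if "sent0" not in st.session_state else st.session_state.sent0,
    key="sent0",
)
```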
@@ -71,7 +71,7 @@ else:
     import torch.nn.functional as F
     from transformers import AutoTokenizer, AutoModel
 
-    @st.cache_data
+    @st.cache_resource
     def load_model_and_tokenizer():
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = AutoModel.from_pretrained('Parallia/Fairly-Multilingual-ModernBERT-Embed-BE', trust_remote_code=True).to(device)
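The decorator swap matters because Streamlit's two caches behave differently: `st.cache_data` serializes and copies its return value on every hit, which is wasteful or outright broken for a loaded `torch` model, while `st.cache_resource` keeps one shared, unserialized instance. A minimal sketch of the distinction, with placeholder function bodies:

```python
# Sketch: Streamlit's two caching decorators (placeholder bodies, not app code).
import streamlit as st

@st.cache_resource  # one shared, unserialized object: models, clients, connections
def load_model():
    ...

@st.cache_data  # memoized per input; the return value is serialized and copied
def preprocess(text: str) -> list[str]:
    return text.split()
```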
@@ -80,16 +80,21 @@ else:
 
     model, tokenizer = load_model_and_tokenizer()
 
+    @st.cache_data
+    def encode_sentences(sent0, sent1):
+        sentences = [sent0, sent1]
+        tokens = []
+        embeddings = []
+        for sentence in sentences:
+            with torch.no_grad():
+                encoded_sentence = tokenizer(sentence, padding=False, truncation=True, return_tensors="pt").to(model.device)
+                embedded_sentence = model(**encoded_sentence).last_hidden_state[0].detach().cpu().clone()
+            tokens.append(tokenizer.tokenize(sentence))
+            embeddings.append(embedded_sentence)
+        return tokens, embeddings
+
     # Encode the sentences
-    sentences = [sent0, sent1]
-    tokens = []
-    embeddings = []
-    for sentence in sentences:
-        with torch.no_grad():
-            encoded_sentence = tokenizer(sentence, padding=False, truncation=True, return_tensors="pt").to(model.device)
-            embedded_sentence = model(**encoded_sentence).last_hidden_state[0].detach().cpu().clone()
-        tokens.append(tokenizer.tokenize(sentence))
-        embeddings.append(embedded_sentence)
+    tokens, embeddings = encode_sentences(sent0, sent1)
 
     # Calculate the cross-token similarity
     token_similarities = F.normalize(embeddings[0], dim=1) @ F.normalize(embeddings[1], dim=1).T
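Factoring the forward pass into `encode_sentences` lets `st.cache_data` memoize it on the `(sent0, sent1)` pair, so reruns that leave both sentences unchanged skip the model entirely. The context line that consumes its output computes cosine similarity: after L2 normalization every row has unit norm, so the matrix product of the two embedding matrices holds the pairwise cosines. A self-contained check with dummy tensors:

```python
# Why F.normalize(...) @ F.normalize(...).T yields cosine similarities:
# each normalized row has unit norm, so row_i · row_j = cos(theta_ij).
import torch
import torch.nn.functional as F

a = torch.randn(5, 8)  # stand-in: 5 tokens of sentence 0, embedding dim 8
b = torch.randn(7, 8)  # stand-in: 7 tokens of sentence 1

sims = F.normalize(a, dim=1) @ F.normalize(b, dim=1).T  # shape (5, 7)
assert sims.shape == (5, 7)
assert torch.all(sims.abs() <= 1 + 1e-6)  # cosine values lie in [-1, 1]
```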
@@ -148,6 +153,8 @@ html += """
     animation-duration: 15s;
     animation-timing-function: steps(14, start);
     animation-iteration-count: infinite;
+    text-decoration: underline;
+    text-decoration: underline double;
     text-decoration: underline 0.3em;
     text-decoration-skip: none;
     text-decoration-skip-ink: none;
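The two added `text-decoration` lines form a progressive-enhancement chain: a browser drops any declaration it cannot parse and keeps the last one it accepted, so older engines fall back from the thickness shorthand to `underline double`, then to plain `underline`. A sketch of the resolved cascade, wrapped in a Python string the way the app emits its CSS:

```python
# Illustrative only: the browser resolves this top to bottom,
# keeping the last declaration it understands.
css_fallback = """
span {
    text-decoration: underline;          /* universally supported */
    text-decoration: underline double;   /* needs line-style in the shorthand */
    text-decoration: underline 0.3em;    /* needs thickness in the shorthand */
}
"""
print(css_fallback)
```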
@@ -180,6 +187,8 @@ for j in range(len(tokens[1])):
     for i in range(len(tokens[0])):
         html += f"""--p{i}: {token_probabilities_12[i][j]}; """
     html += """}"""
+html += """
+body:has(#sent0:hover,#sent1:hover) span { --p: 0 !important; animation-play-state: paused; }"""
 for i in range(len(tokens[0])):
     html += f"""
     body:has(#sent0 span:nth-child({i+1}):hover) span {{ --p: var(--p{i}) !important; }}"""
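The new blanket rule pauses the cycling animation and zeroes `--p` as soon as either sentence is hovered; the per-token rules emitted after it match with higher specificity (they add a nested `span` and an `:nth-child()` pseudo-class), so only the hovered token's probabilities are re-applied. A reduced sketch of the generation order, with a hypothetical token count:

```python
# Reduced sketch of the rule emission order (NUM_TOKENS is hypothetical).
NUM_TOKENS = 3

html = "<style>"
# Blanket rule: any hover over either sentence pauses the animation.
html += """
body:has(#sent0:hover,#sent1:hover) span { --p: 0 !important; animation-play-state: paused; }"""
# Per-token rules: more specific, so they win for the hovered token only.
for i in range(NUM_TOKENS):
    html += f"""
body:has(#sent0 span:nth-child({i+1}):hover) span {{ --p: var(--p{i}) !important; }}"""
html += "</style>"
print(html)
```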
 