antoinelouis committed on
Commit f2f343a
1 Parent(s): 2a67ef4

Update app.py

Files changed (1)
  1. app.py +106 -98
app.py CHANGED
@@ -3,7 +3,6 @@ import csv
 import json
 import torch
 import shutil
-import requests
 import textwrap
 import numpy as np
 import pandas as pd
@@ -78,15 +77,15 @@ def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
     translator = pipeline(task="translation", tokenizer=model_name, model=model_name)
     return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
 
-def push_to_hub(username: str, token: str, model_dir: str, private: bool = False):
-    _ = whoami(token=token)
-    api = HfApi(endpoint="https://huggingface.co", token=token)
-    repo_id = f"{username}/{model_dir.split('/')[-1]}"
+def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
+    _ = whoami(token=hf_token)
+    api = HfApi(endpoint="https://huggingface.co", token=hf_token)
+    repo_id = f"{hf_username}/{model_dir.split('/')[-1]}"
     api.create_repo(repo_id=repo_id, repo_type="model", private=private)
     api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
 
-def prune_model(model_name: str, language: str, username: str, token: str):
-    st.markdown(f"- Pruning the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only. *Let's go!*")
+def prune_model(model_name: str, language: str, hf_username: str, hf_token: str):
+    st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")
 
     # Load the model and its tokenizer
     model, tokenizer = load_model_and_tokenizer(model_name)
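The `huggingface_hub` calls above can be exercised on their own. Below is a minimal sketch of the same upload flow, assuming only what the diff shows (the function name `upload_pruned_model` and the `exist_ok=True` flag are illustrative additions, not part of the app); `whoami` doubles as a cheap token check because it raises on an invalid token:

```python
from huggingface_hub import HfApi, whoami

def upload_pruned_model(hf_username: str, hf_token: str, model_dir: str) -> str:
    _ = whoami(token=hf_token)  # raises if the token is invalid
    api = HfApi(token=hf_token)
    # Name the repo after the output directory, as the diff does
    repo_id = f"{hf_username}/{model_dir.rstrip('/').split('/')[-1]}"
    api.create_repo(repo_id=repo_id, repo_type="model", private=False, exist_ok=True)
    api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
    return repo_id
```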
@@ -97,7 +96,7 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     embedding_params = count_parameters(model, layer_name="embeddings")
 
     st.markdown(
-        f"- The model has **{all_params/1e6:.1f}M** parameters, of which **{embedding_params/all_params*100:.0f}%** "+
+        f"- The original model has **{all_params/1e6:.1f}M** parameters, of which **{embedding_params/all_params*100:.0f}%** "+
        f"(i.e., {embedding_params/1e6:.1f}M params) come from the *embedding matrix* and its {tokenizer.vocab_size} token entries. "+
        f"This means that the contextualization of text sequences is actually done by a *{model.config.num_hidden_layers}-layer Transformer encoder* "+
        f"with **{encoder_params/1e6:.1f}M** parameters only."
@@ -110,77 +109,82 @@ def prune_model(model_name: str, language: str, username: str, token: str):
        f"of the model vocabulary (i.e., {len(filtered_tokens)} out of the original {tokenizer.vocab_size} tokens)."
     )
 
-    st.markdown("- *Updating the tokenizer...*")
-    outdir = f"{language}-{model_name.split('/')[-1]}"
-
-    # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
-    tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
-    original_vocab = tokenizer_json['model']['vocab']
-
-    # Build a mapping from tokens to their original IDs
-    original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
-
-    # Filter out the tokens to remove and reassign new IDs
-    new_id = 0
-    new_token_to_id = {}
-    new_id_to_original_id = {}
-    filtered_vocab_entries = []
-
-    for token, score in original_vocab:
-        if token in filtered_tokens:
-            filtered_vocab_entries.append([token, score])
-            new_token_to_id[token] = new_id
-            new_id_to_original_id[new_id] = original_token_to_id[token]
-            new_id += 1
-
-    # Update the vocab in the tokenizer JSON and rebuild the tokenizer from the modified JSON
-    tokenizer_json['model']['vocab'] = filtered_vocab_entries
-    new_backend_tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
-
-    # Create a new tokenizer instance and save it
-    new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_backend_tokenizer, **tokenizer.init_kwargs)
-    new_tokenizer.save_pretrained(outdir)
-
-    st.markdown("- *Updating the embedding matrix...*")
-    new_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
-
-    # Create a new embedding matrix and map the original vectors to their new IDs
-    original_embeddings = new_model.get_input_embeddings().weight.data
-    new_embeddings = torch.nn.Embedding(
-        num_embeddings=new_tokenizer.vocab_size,
-        embedding_dim=model.config.hidden_size,
-        padding_idx=new_tokenizer.pad_token_id,
-    )
-
-    for new_id in range(new_tokenizer.vocab_size):
-        original_id = new_id_to_original_id.get(new_id)
-        new_embeddings.weight.data[new_id] = original_embeddings[original_id]
-
-    new_model.set_input_embeddings(new_embeddings)
-    new_model.config.vocab_size = new_tokenizer.vocab_size
-    new_model.save_pretrained(outdir)
-
-    # Test the conversion
-    test_sentence = get_test_sentence(LANGUAGES[language]['nllb_code'])
-    st.markdown(f"""- *Verifying everything worked as expected with the following test sentence: "{test_sentence}"*""")
-
-    assert len(new_tokenizer) == len(filtered_tokens), f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({len(filtered_tokens)})"
-    assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), "ERROR: The new tokenizer vocabulary doesn't match the set of filtered tokens"
-
-    with torch.inference_mode():
-        emb1 = model(**tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
-        emb2 = new_model(**new_tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
-        diff = np.abs(emb1 - emb2).max()
-        assert diff < 1e-6, f"ERROR: Some dimensions of the two vectors have a non-negligible difference ({diff})"
-
-    st.success("The conversion **succeeded**! You can verify it by looking at the output *[cls]* token embedding:")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.markdown("Original model:")
-        st.code(f"{emb1.tolist()}")
-    with col2:
-        st.markdown("Pruned model:")
-        st.code(f"{emb2.tolist()}")
+    with st.status("Pruning the model...", expanded=True) as status:
+        st.write("- *Updating the tokenizer*")
+        outdir = f"{language}-{model_name.split('/')[-1]}"
+
+        # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
+        tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
+        original_vocab = tokenizer_json['model']['vocab']
+
+        # Build a mapping from tokens to their original IDs
+        original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
+
+        # Filter out the tokens to remove and reassign new IDs
+        new_id = 0
+        new_token_to_id = {}
+        new_id_to_original_id = {}
+        filtered_vocab_entries = []
+
+        for token, score in original_vocab:
+            if token in filtered_tokens:
+                filtered_vocab_entries.append([token, score])
+                new_token_to_id[token] = new_id
+                new_id_to_original_id[new_id] = original_token_to_id[token]
+                new_id += 1
+
+        # Update the vocab in the tokenizer JSON and rebuild the tokenizer from the modified JSON
+        tokenizer_json['model']['vocab'] = filtered_vocab_entries
+        new_backend_tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
+
+        # Create a new tokenizer instance and save it
+        new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_backend_tokenizer, **tokenizer.init_kwargs)
+        new_tokenizer.save_pretrained(outdir)
+
+        st.write("- *Updating the embedding matrix*")
+        new_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+
+        # Create a new embedding matrix and map the original vectors to their new IDs
+        original_embeddings = new_model.get_input_embeddings().weight.data
+        new_embeddings = torch.nn.Embedding(
+            num_embeddings=new_tokenizer.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            padding_idx=new_tokenizer.pad_token_id,
+        )
+
+        for new_id in range(new_tokenizer.vocab_size):
+            original_id = new_id_to_original_id.get(new_id)
+            new_embeddings.weight.data[new_id] = original_embeddings[original_id]
+
+        new_model.set_input_embeddings(new_embeddings)
+        new_model.config.vocab_size = new_tokenizer.vocab_size
+        new_model.save_pretrained(outdir)
+
+        status.update(state="complete", expanded=True)
+
+    with st.status("Testing the conversion...", expanded=True) as status:
+        st.write("- *Checking the pruned tokenizer*")
+        assert len(new_tokenizer) == len(filtered_tokens), f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({len(filtered_tokens)})"
+        assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), "ERROR: The new tokenizer vocabulary doesn't match the set of filtered tokens"
+
+        st.write("- *Checking the pruned model*")
+        test_sentence = get_test_sentence(LANGUAGES[language]['nllb_code'])
+        with torch.inference_mode():
+            emb1 = model(**tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
+            emb2 = new_model(**new_tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
+            diff = np.abs(emb1 - emb2).max()
+            assert diff < 1e-6, f"ERROR: Some dimensions of the two vectors have a non-negligible difference ({diff})"
+
+        st.write(f"""All good! The output *[cls]* token embedding of the test sentence *"{test_sentence}"* should be similar:""")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.markdown("Original model:")
+            st.code(f"{emb1.tolist()}")
+        with col2:
+            st.markdown("Pruned model:")
+            st.code(f"{emb2.tolist()}")
+
+        status.update(state="complete", expanded=True)
 
     # Show visually the result of the pruning process
     pruned_all_params = count_parameters(new_model)
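The remapping loop above is the heart of the pruning step: every surviving token keeps its original embedding vector but receives a new, densely packed ID. A self-contained toy version of that copy, independent of any real checkpoint:

```python
import torch

# Keep original rows {0, 3, 4} of a 6-row embedding matrix, packed into new IDs 0..2
old_emb = torch.nn.Embedding(num_embeddings=6, embedding_dim=4)
new_id_to_original_id = dict(enumerate([0, 3, 4]))  # tokens that survive the filter

new_emb = torch.nn.Embedding(num_embeddings=3, embedding_dim=4)
for new_id, original_id in new_id_to_original_id.items():
    new_emb.weight.data[new_id] = old_emb.weight.data[original_id]

# Vectors are preserved; only their indices change
assert torch.equal(new_emb.weight.data[1], old_emb.weight.data[3])
```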
@@ -201,7 +205,7 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     st.plotly_chart(fig)
 
     # Add a README to the pruned model repo
-    new_model_name = f"{username}/{outdir.split('/')[-1]}"
+    new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
     readme_content = textwrap.dedent(f"""
     ---
     pipeline_tag: sentence-similarity
@@ -213,19 +217,16 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     - pruned
     library_name: sentence-transformers
     base_model: {model_name}
-    base_model_relation: pruned
+    base_model_relation: quantized
     ---
-    # {new_model_name.split('/')[-1]}
+    # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
 
-    This model is a pruned version of [{model_name}](https://huggingface.co/{model_name}) for the {language.capitalize()} language.
-
-    It was created by the [Multilingual Text Embedding Model Pruner](https://huggingface.co/spaces/antoinelouis/mteb-pruner) space,
-    which removed tokens not commonly used in {language.capitalize()} from the original multilingual model's vocabulary and adjusted
-    the model's embedding matrix accordingly.
-
-    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks, but with a much smaller
-    memory footprint ({100 - pruned_all_params/all_params*100:.1f}% smaller). However, it may not perform well for other languages present
-    in the original multilingual model.
+    This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
+    for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
+
+    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
+    memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
+    commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
 
     ## Usage
 
@@ -238,13 +239,16 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
     ```
+
+    **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
     """)
     with open(os.path.join(outdir, "README.md"), "w") as f:
         f.write(readme_content)
 
-    st.markdown("- *Pushing the pruned model to your Hugging Face account...*")
-    push_to_hub(username, token, outdir)
-    shutil.rmtree(outdir)
+    with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
+        push_to_hub(hf_username, hf_token, outdir)
+        shutil.rmtree(outdir)
+        status.update(state="complete", expanded=False)
 
     st.markdown("Done! You can now load your pruned model like this:")
     st.code(f"""
@@ -261,7 +265,7 @@ def main():
     st.markdown("""
     This space helps you create a smaller, language-specific version of a multilingual text embedding model. Here's what it does:
 
-    1. 🌎 Takes a popular text embedding model that was trained on many languages
+    1. 🌎 Takes a state-of-the-art text embedding model that was trained on many languages
     2. ✂️ Trims it down to focus on just one language by removing unused tokens from its vocabulary
     3. 🚀 Gives you a smaller model that works just as well for your chosen language
 
@@ -279,14 +283,17 @@ def main():
         options=list(LANGUAGES.keys()),
         format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
     )
-    username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
-    token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
+    col1, col2 = st.columns(2)
+    with col1:
+        hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
+    with col2:
+        hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
 
     if st.button("Prune Model"):
-        if not username or not token:
+        if not hf_username or not hf_token:
             st.error("Your HF username and access token are required to save the pruned model on your account.")
         else:
-            prune_model(model_name, language, username, token)
+            prune_model(model_name, language, hf_username, hf_token)
 
     st.markdown(
         """
 