antoinelouis committed on
Commit f2f343a
1 Parent(s): 2a67ef4

Update app.py

Files changed (1)
  1. app.py +106 -98
app.py CHANGED
@@ -3,7 +3,6 @@ import csv
 import json
 import torch
 import shutil
-import requests
 import textwrap
 import numpy as np
 import pandas as pd
@@ -78,15 +77,15 @@ def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
     translator = pipeline(task="translation", tokenizer=model_name, model=model_name)
     return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
 
-def push_to_hub(username: str, token: str, model_dir: str, private: bool = False):
-    _ = whoami(token=token)
-    api = HfApi(endpoint="https://huggingface.co", token=token)
-    repo_id = f"{username}/{model_dir.split('/')[-1]}"
+def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
+    _ = whoami(token=hf_token)
+    api = HfApi(endpoint="https://huggingface.co", token=hf_token)
+    repo_id = f"{hf_username}/{model_dir.split('/')[-1]}"
     api.create_repo(repo_id=repo_id, repo_type="model", private=private)
     api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
 
-def prune_model(model_name: str, language: str, username: str, token: str):
-    st.markdown(f"- Pruning the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only. *Let's go!*")
+def prune_model(model_name: str, language: str, hf_username: str, hf_token: str):
+    st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")
 
     # Load the model and its tokenizer
     model, tokenizer = load_model_and_tokenizer(model_name)
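The `huggingface_hub` calls above can be exercised on their own. Below is a minimal sketch of the same upload flow, assuming only what the diff shows (the function name `upload_pruned_model` and the `exist_ok=True` flag are illustrative additions, not part of the app); `whoami` doubles as a cheap token check because it raises on an invalid token:

```python
from huggingface_hub import HfApi, whoami

def upload_pruned_model(hf_username: str, hf_token: str, model_dir: str) -> str:
    _ = whoami(token=hf_token)  # raises if the token is invalid
    api = HfApi(token=hf_token)
    # Name the repo after the output directory, as the diff does
    repo_id = f"{hf_username}/{model_dir.rstrip('/').split('/')[-1]}"
    api.create_repo(repo_id=repo_id, repo_type="model", private=False, exist_ok=True)
    api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
    return repo_id
```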
@@ -97,7 +96,7 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     embedding_params = count_parameters(model, layer_name="embeddings")
 
     st.markdown(
-        f"- The model has **{all_params/1e6:.1f}M** parameters, of which **{embedding_params/all_params*100:.0f}%** "+
+        f"- The original model has **{all_params/1e6:.1f}M** parameters, of which **{embedding_params/all_params*100:.0f}%** "+
        f"(i.e., {embedding_params/1e6:.1f}M params) come from the *embedding matrix* and its {tokenizer.vocab_size} token entries. "+
        f"This means that the contextualization of text sequences is actually done by a *{model.config.num_hidden_layers}-layer Transformer encoder* "+
        f"with **{encoder_params/1e6:.1f}M** parameters only."
@@ -110,77 +109,82 @@ def prune_model(model_name: str, language: str, username: str, token: str):
        f"of the model vocabulary (i.e., {len(filtered_tokens)} out of the original {tokenizer.vocab_size} tokens)."
     )
 
-    st.markdown("- *Updating the tokenizer...*")
-    outdir = f"{language}-{model_name.split('/')[-1]}"
-
-    # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
-    tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
-    original_vocab = tokenizer_json['model']['vocab']
-
-    # Build a mapping from tokens to their original IDs
-    original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
-
-    # Filter out the tokens to remove and reassign new IDs
-    new_id = 0
-    new_token_to_id = {}
-    new_id_to_original_id = {}
-    filtered_vocab_entries = []
-
-    for token, score in original_vocab:
-        if token in filtered_tokens:
-            filtered_vocab_entries.append([token, score])
-            new_token_to_id[token] = new_id
-            new_id_to_original_id[new_id] = original_token_to_id[token]
-            new_id += 1
-
-    # Update the vocab in the tokenizer JSON and rebuild the tokenizer from the modified JSON
-    tokenizer_json['model']['vocab'] = filtered_vocab_entries
-    new_backend_tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
-
-    # Create a new tokenizer instance and save it
-    new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_backend_tokenizer, **tokenizer.init_kwargs)
-    new_tokenizer.save_pretrained(outdir)
-
-    st.markdown("- *Updating the embedding matrix...*")
-    new_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
-
-    # Create a new embedding matrix and map the original vectors to their new IDs
-    original_embeddings = new_model.get_input_embeddings().weight.data
-    new_embeddings = torch.nn.Embedding(
-        num_embeddings=new_tokenizer.vocab_size,
-        embedding_dim=model.config.hidden_size,
-        padding_idx=new_tokenizer.pad_token_id,
-    )
-
-    for new_id in range(new_tokenizer.vocab_size):
-        original_id = new_id_to_original_id.get(new_id)
-        new_embeddings.weight.data[new_id] = original_embeddings[original_id]
-
-    new_model.set_input_embeddings(new_embeddings)
-    new_model.config.vocab_size = new_tokenizer.vocab_size
-    new_model.save_pretrained(outdir)
-
-    # Test the conversion
-    test_sentence = get_test_sentence(LANGUAGES[language]['nllb_code'])
-    st.markdown(f"""- *Verifying everything worked as expected with the following test sentence: "{test_sentence}"*""")
-
-    assert len(new_tokenizer) == len(filtered_tokens), f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({len(filtered_tokens)})"
-    assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), "ERROR: The new tokenizer vocabulary doesn't match the set of filtered tokens"
-
-    with torch.inference_mode():
-        emb1 = model(**tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
-        emb2 = new_model(**new_tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
-        diff = np.abs(emb1 - emb2).max()
-        assert diff < 1e-6, f"ERROR: Some dimensions of the two vectors have a non-negligible difference ({diff})"
-
-    st.success("The conversion **succeeded**! You can verify it by looking at the output *[cls]* token embedding:")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.markdown("Original model:")
-        st.code(f"{emb1.tolist()}")
-    with col2:
-        st.markdown("Pruned model:")
-        st.code(f"{emb2.tolist()}")
+    with st.status("Pruning the model...", expanded=True) as status:
+        st.write("- *Updating the tokenizer*")
+        outdir = f"{language}-{model_name.split('/')[-1]}"
+
+        # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
+        tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
+        original_vocab = tokenizer_json['model']['vocab']
+
+        # Build a mapping from tokens to their original IDs
+        original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
+
+        # Filter out the tokens to remove and reassign new IDs
+        new_id = 0
+        new_token_to_id = {}
+        new_id_to_original_id = {}
+        filtered_vocab_entries = []
+
+        for token, score in original_vocab:
+            if token in filtered_tokens:
+                filtered_vocab_entries.append([token, score])
+                new_token_to_id[token] = new_id
+                new_id_to_original_id[new_id] = original_token_to_id[token]
+                new_id += 1
+
+        # Update the vocab in the tokenizer JSON and rebuild the tokenizer from the modified JSON
+        tokenizer_json['model']['vocab'] = filtered_vocab_entries
+        new_backend_tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
+
+        # Create a new tokenizer instance and save it
+        new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_backend_tokenizer, **tokenizer.init_kwargs)
+        new_tokenizer.save_pretrained(outdir)
+
+        st.write("- *Updating the embedding matrix*")
+        new_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+
+        # Create a new embedding matrix and map the original vectors to their new IDs
+        original_embeddings = new_model.get_input_embeddings().weight.data
+        new_embeddings = torch.nn.Embedding(
+            num_embeddings=new_tokenizer.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            padding_idx=new_tokenizer.pad_token_id,
+        )
+
+        for new_id in range(new_tokenizer.vocab_size):
+            original_id = new_id_to_original_id.get(new_id)
+            new_embeddings.weight.data[new_id] = original_embeddings[original_id]
+
+        new_model.set_input_embeddings(new_embeddings)
+        new_model.config.vocab_size = new_tokenizer.vocab_size
+        new_model.save_pretrained(outdir)
+
+        status.update(state="complete", expanded=True)
+
+    with st.status("Testing the conversion...", expanded=True) as status:
+        st.write("- *Checking the pruned tokenizer*")
+        assert len(new_tokenizer) == len(filtered_tokens), f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({len(filtered_tokens)})"
+        assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), "ERROR: The new tokenizer vocabulary doesn't match the set of filtered tokens"
+
+        st.write("- *Checking the pruned model*")
+        test_sentence = get_test_sentence(LANGUAGES[language]['nllb_code'])
+        with torch.inference_mode():
+            emb1 = model(**tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
+            emb2 = new_model(**new_tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
+            diff = np.abs(emb1 - emb2).max()
+            assert diff < 1e-6, f"ERROR: Some dimensions of the two vectors have a non-negligible difference ({diff})"
+
+        st.write(f"""All good! The output *[cls]* token embedding of the test sentence *"{test_sentence}"* should be similar:""")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.markdown("Original model:")
+            st.code(f"{emb1.tolist()}")
+        with col2:
+            st.markdown("Pruned model:")
+            st.code(f"{emb2.tolist()}")
+
+        status.update(state="complete", expanded=True)
 
     # Show visually the result of the pruning process
     pruned_all_params = count_parameters(new_model)
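The remapping loop above is the heart of the pruning step: every surviving token keeps its original embedding vector but receives a new, densely packed ID. A self-contained toy version of that copy, independent of any real checkpoint:

```python
import torch

# Keep original rows {0, 3, 4} of a 6-row embedding matrix, packed into new IDs 0..2
old_emb = torch.nn.Embedding(num_embeddings=6, embedding_dim=4)
new_id_to_original_id = dict(enumerate([0, 3, 4]))  # tokens that survive the filter

new_emb = torch.nn.Embedding(num_embeddings=3, embedding_dim=4)
for new_id, original_id in new_id_to_original_id.items():
    new_emb.weight.data[new_id] = old_emb.weight.data[original_id]

# Vectors are preserved; only their indices change
assert torch.equal(new_emb.weight.data[1], old_emb.weight.data[3])
```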
@@ -201,7 +205,7 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     st.plotly_chart(fig)
 
     # Add a README to the pruned model repo
-    new_model_name = f"{username}/{outdir.split('/')[-1]}"
+    new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
     readme_content = textwrap.dedent(f"""
     ---
     pipeline_tag: sentence-similarity
@@ -213,19 +217,16 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     - pruned
     library_name: sentence-transformers
     base_model: {model_name}
-    base_model_relation: pruned
+    base_model_relation: quantized
     ---
-    # {new_model_name.split('/')[-1]}
+    # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
 
-    This model is a pruned version of [{model_name}](https://huggingface.co/{model_name}) for the {language.capitalize()} language.
-
-    It was created by the [Multilingual Text Embedding Model Pruner](https://huggingface.co/spaces/antoinelouis/mteb-pruner) space,
-    which removed tokens not commonly used in {language.capitalize()} from the original multilingual model's vocabulary and adjusted
-    the model's embedding matrix accordingly.
-
-    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks, but with a much smaller
-    memory footprint ({100 - pruned_all_params/all_params*100:.1f}% smaller). However, it may not perform well for other languages present
-    in the original multilingual model.
+    This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
+    for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
+
+    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
+    memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
+    commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
 
     ## Usage
 
@@ -238,13 +239,16 @@ def prune_model(model_name: str, language: str, username: str, token: str):
     model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
     ```
+
+    **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
     """)
     with open(os.path.join(outdir, "README.md"), "w") as f:
         f.write(readme_content)
 
-    st.markdown("- *Pushing the pruned model to your Hugging Face account...*")
-    push_to_hub(username, token, outdir)
-    shutil.rmtree(outdir)
+    with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
+        push_to_hub(hf_username, hf_token, outdir)
+        shutil.rmtree(outdir)
+        status.update(state="complete", expanded=False)
 
     st.markdown("Done! You can now load your pruned model like this:")
     st.code(f"""
@@ -261,7 +265,7 @@ def main():
     st.markdown("""
     This space helps you create a smaller, language-specific version of a multilingual text embedding model. Here's what it does:
 
-    1. 🌎 Takes a popular text embedding model that was trained on many languages
+    1. 🌎 Takes a state-of-the-art text embedding model that was trained on many languages
     2. ✂️ Trims it down to focus on just one language by removing unused tokens from its vocabulary
     3. 🚀 Gives you a smaller model that works just as well for your chosen language
 
@@ -279,14 +283,17 @@ def main():
         options=list(LANGUAGES.keys()),
         format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
     )
-    username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
-    token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
+    col1, col2 = st.columns(2)
+    with col1:
+        hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
+    with col2:
+        hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
 
     if st.button("Prune Model"):
-        if not username or not token:
+        if not hf_username or not hf_token:
             st.error("Your HF username and access token are required to save the pruned model on your account.")
         else:
-            prune_model(model_name, language, username, token)
+            prune_model(model_name, language, hf_username, hf_token)
 
     st.markdown(
         """
 