antoinelouis committed on
Commit d48edf5
1 Parent(s): 85c17c2

Update app.py

Files changed (1)
  1. app.py +8 -9
app.py CHANGED
@@ -7,7 +7,6 @@ import textwrap
 import numpy as np
 import pandas as pd
 import streamlit as st
-from tqdm.auto import tqdm
 from collections import Counter
 from tokenizers import Tokenizer
 import plotly.graph_objects as go
@@ -35,18 +34,20 @@ MODELS = [
 
 def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str):
     """
-    Estimate the most common tokens in the language. You should first download the 1M sentences dataset for the desired language.
-    Source: https://wortschatz.uni-leipzig.de/en/download/English
+    Estimate the most common tokens in the language. You should first download the 1M sentences dataset
+    for the desired language. Source: https://wortschatz.uni-leipzig.de/en/download/English
     """
     sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
     if os.path.exists(sentences_file):
-        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
         my_bar = st.progress(0)
-        counter = Counter(tokenizer.all_special_tokens)
+        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
+        counter = Counter(tokenizer.all_special_ids)
         for i, text in enumerate(df.text):
-            counter.update(tok for tok in tokenizer.tokenize(text))
+            counter.update(tokid for tokid in tokenizer.encode(text))
             my_bar.progress(i/len(df), text=f"{i/len(df)*100:.0f}%")
-        return set(counter)
+        filtered_token_ids = sorted(counter.keys())
+        filtered_tokens = tokenizer.convert_ids_to_tokens(filtered_token_ids)
+        return set(filtered_tokens)
     else:
         raise FileNotFoundError
 
@@ -119,8 +120,6 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str,
     # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
     tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
    original_vocab = tokenizer_json['model']['vocab']
-
-    # Build a mapping from tokens to their original IDs
     original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
 
     # Filter out the tokens to remove and reassign new IDs
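For context, the functional change in estimate_pruned_vocabulary is the switch from counting string tokens (tokenizer.tokenize) to counting token IDs (tokenizer.encode), with the kept IDs converted back to tokens at the end. Below is a minimal, self-contained sketch of that counting approach; the checkpoint name and sample sentences are placeholders and are not taken from the app's MODELS list or the Leipzig corpus file.

# Sketch of the ID-based counting introduced by this commit (placeholder inputs).
from collections import Counter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # assumed example checkpoint

sentences = [
    "Ceci est une phrase d'exemple.",
    "Le corpus Leipzig fournit un million de phrases par langue.",
]

counter = Counter(tokenizer.all_special_ids)   # seed with special-token IDs so they always survive
for text in sentences:
    counter.update(tokenizer.encode(text))     # count token IDs rather than string tokens

kept_ids = sorted(counter.keys())
kept_tokens = set(tokenizer.convert_ids_to_tokens(kept_ids))
print(f"{len(kept_tokens)} tokens would survive pruning")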