antoinelouis committed on
Commit d48edf5
1 Parent(s): 85c17c2

Update app.py

Files changed (1)
  1. app.py +8 -9
app.py CHANGED
@@ -7,7 +7,6 @@ import textwrap
 import numpy as np
 import pandas as pd
 import streamlit as st
-from tqdm.auto import tqdm
 from collections import Counter
 from tokenizers import Tokenizer
 import plotly.graph_objects as go
@@ -35,18 +34,20 @@ MODELS = [
 
 def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str):
     """
-    Estimate the most common tokens in the language. You should first download the 1M sentences dataset for the desired language.
-    Source: https://wortschatz.uni-leipzig.de/en/download/English
+    Estimate the most common tokens in the language. You should first download the 1M sentences dataset
+    for the desired language. Source: https://wortschatz.uni-leipzig.de/en/download/English
     """
     sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
     if os.path.exists(sentences_file):
-        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
         my_bar = st.progress(0)
-        counter = Counter(tokenizer.all_special_tokens)
+        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
+        counter = Counter(tokenizer.all_special_ids)
         for i, text in enumerate(df.text):
-            counter.update(tok for tok in tokenizer.tokenize(text))
+            counter.update(tokid for tokid in tokenizer.encode(text))
             my_bar.progress(i/len(df), text=f"{i/len(df)*100:.0f}%")
-        return set(counter)
+        filtered_token_ids = sorted(counter.keys())
+        filtered_tokens = tokenizer.convert_ids_to_tokens(filtered_token_ids)
+        return set(filtered_tokens)
     else:
         raise FileNotFoundError
 
@@ -119,8 +120,6 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str,
     # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
     tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
    original_vocab = tokenizer_json['model']['vocab']
-
-    # Build a mapping from tokens to their original IDs
     original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
 
     # Filter out the tokens to remove and reassign new IDs
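For context, the functional change in estimate_pruned_vocabulary is the switch from counting string tokens (tokenizer.tokenize) to counting token IDs (tokenizer.encode), with the kept IDs converted back to tokens at the end. Below is a minimal, self-contained sketch of that counting approach; the checkpoint name and sample sentences are placeholders and are not taken from the app's MODELS list or the Leipzig corpus file.

# Sketch of the ID-based counting introduced by this commit (placeholder inputs).
from collections import Counter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # assumed example checkpoint

sentences = [
    "Ceci est une phrase d'exemple.",
    "Le corpus Leipzig fournit un million de phrases par langue.",
]

counter = Counter(tokenizer.all_special_ids)   # seed with special-token IDs so they always survive
for text in sentences:
    counter.update(tokenizer.encode(text))     # count token IDs rather than string tokens

kept_ids = sorted(counter.keys())
kept_tokens = set(tokenizer.convert_ids_to_tokens(kept_ids))
print(f"{len(kept_tokens)} tokens would survive pruning")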