antoinelouis committed
Commit • d48edf5
1 Parent(s): 85c17c2
Update app.py
app.py CHANGED
@@ -7,7 +7,6 @@ import textwrap
 import numpy as np
 import pandas as pd
 import streamlit as st
-from tqdm.auto import tqdm
 from collections import Counter
 from tokenizers import Tokenizer
 import plotly.graph_objects as go
@@ -35,18 +34,20 @@ MODELS = [
 
 def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str):
     """
-    Estimate the most common tokens in the language. You should first download the 1M sentences dataset
-    Source: https://wortschatz.uni-leipzig.de/en/download/English
+    Estimate the most common tokens in the language. You should first download the 1M sentences dataset
+    for the desired language. Source: https://wortschatz.uni-leipzig.de/en/download/English
     """
     sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
     if os.path.exists(sentences_file):
-        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
         my_bar = st.progress(0)
-
+        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
+        counter = Counter(tokenizer.all_special_ids)
         for i, text in enumerate(df.text):
-            counter.update(
+            counter.update(tokid for tokid in tokenizer.encode(text))
             my_bar.progress(i/len(df), text=f"{i/len(df)*100:.0f}%")
-
+        filtered_token_ids = sorted(counter.keys())
+        filtered_tokens = tokenizer.convert_ids_to_tokens(filtered_token_ids)
+        return set(filtered_tokens)
     else:
         raise FileNotFoundError
 
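For reference, the rewritten estimate_pruned_vocabulary boils down to a frequency pass over the Leipzig sentences file: each sentence is encoded, the token IDs are accumulated in a Counter seeded with the special-token IDs, and every ID observed at least once is kept. A minimal standalone sketch of the same idea, outside Streamlit and with a hypothetical model name and file path, could look like this:

    import csv
    from collections import Counter

    import pandas as pd
    from transformers import AutoTokenizer

    # Hypothetical inputs: any fast tokenizer plus a Leipzig-style TSV of sentences.
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    sentences_file = "data.nosync/english_news_2020_1M-sentences.txt"

    df = pd.read_csv(sentences_file, sep="\t", header=None,
                     quoting=csv.QUOTE_NONE, names=["id", "text"])

    # Seed the counter with the special-token IDs so they are never pruned away.
    counter = Counter(tokenizer.all_special_ids)
    for text in df.text:
        counter.update(tokenizer.encode(text))

    # The pruned vocabulary is every token that was observed at least once.
    kept_tokens = set(tokenizer.convert_ids_to_tokens(sorted(counter.keys())))
    print(f"keeping {len(kept_tokens)} of {len(tokenizer)} tokens")

Seeding the counter with tokenizer.all_special_ids mirrors the change in this commit: special tokens must survive pruning even if they never appear in the corpus text.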
@@ -119,8 +120,6 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str,
     # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
     tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
     original_vocab = tokenizer_json['model']['vocab']
-
-    # Build a mapping from tokens to their original IDs
     original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
 
     # Filter out the tokens to remove and reassign new IDs