Rearranging some of the figures
app.py CHANGED

@@ -18,12 +18,16 @@ def load_data():
 def reload_example_text_data():
     random_id = random.choice(val_data['id'])
     tempdf = subset_df[subset_df['id']==random_id]
-    tempdf.
+    tempdf.rename(columns={'lang': 'Language'}, inplace=True)
+    tempdf.set_index('Language', inplace=True)
     tempdf = tempdf[['iso', 'text', tokenizer_name]]
     tempdf.columns=['ISO', 'Text', 'Num Tokens']
     tempdf.sort_values(by='ISO', inplace=True)
     st.session_state.examplesdf = tempdf
 
+
+
+
 # TODO allow new tokenizers from HF
 tokenizer_names_to_test = [
     "openai/gpt4",

@@ -57,6 +61,8 @@ with st.sidebar:
     val_data = load_data()
     st.success(f'Data loaded: {len(val_data)}')
 
+    # st.write(val_data.columns, val_data.head())
+
     with st.expander('Data Source'):
         st.write("The data in this figure is the validation set of the [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)")
 

@@ -91,34 +97,64 @@ with st.container():
     if tokenizer_name in val_data.columns:
         subset_df = val_data[val_data.lang.isin(languages)]
         subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
-
-        st.header('
+
+        # st.header(f'Comparing languages for {tokenizer_name}')
+
+        st.subheader(f'Median Token Length for `{tokenizer_name}`')
+        metric_cols = st.columns(len(languages))
+        for i, _lang in enumerate(languages):
+            metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))
+
+
         fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=show_hist)
 
         fig.update_layout(
-            title=dict(text=
-            # title=
+            title=dict(text='Token Distribution', font=dict(size=25), automargin=True, yref='paper', ),
+            # title='Distribution of tokens',
             xaxis_title="Number of Tokens",
             yaxis_title="Density",
+            height=500
             # title_font_family='"Source Sans Pro", sans-serif'
         )
         st.plotly_chart(fig, use_container_width=True)
 
-
-
-        for i, _lang in enumerate(languages):
-            metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))
+
+
 
 
         st.subheader('Example Texts')
-
         reload_example_text_data()
         if st.button("π Randomly sample"):
             reload_example_text_data()
-
         st.dataframe(st.session_state.examplesdf) # Same as st.write(df)
 
 
+        # val_median_data = val_data.groupby('lang')[tokenizer_name].apply(np.median)
+        # val_median_data = val_median_data.sort_values(ascending=False)
+        # val_median_data = val_median_data.reset_index()
+        # # val_median_data = val_median_data[val_median_data.lang.isin(languages)]
+        # val_median_data[tokenizer_name] = val_median_data[tokenizer_name].astype(int)
+        # val_median_data.columns = ['Language', 'Median Number of Tokens']
+        # # st.write(val_median_data.head())
+        # bar_fig = px.bar(
+        #     val_median_data,
+        #     y='Language',
+        #     x='Median Number of Tokens',
+        #     text_auto='d',
+        #     orientation='h',
+        #     hover_data=val_median_data.columns,
+        #     height=1000,
+        # )
+        # bar_fig.update_traces(textfont_size=12, textangle=0, cliponaxis=False)
+        # bar_fig.update_layout(
+        #     title=dict(text='Comparison of median token lengths',
+        #     font=dict(size=20),
+        #     automargin=True, yref='paper', ),
+        # )
+        # st.plotly_chart(bar_fig, use_container_width=True)
+
+
+
 
 
 

@@ -126,6 +162,8 @@ with st.container():
 
 
 
+
+
     with st.expander("About the project"):
         st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP.")
 
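For reference, a minimal, self-contained sketch of the rearranged chart section as it reads after this commit: the median-token metrics row now sits above the distribution plot, and the plot gains an explicit title and a fixed height. Everything in the sketch that is not in the diff is a stand-in: the toy DataFrame, the random token counts, and the hard-coded language and tokenizer names only exist to make the snippet runnable with streamlit, plotly, numpy, and pandas installed. Only the Streamlit and Plotly calls mirror the added lines above.

# Hypothetical stand-ins for the Space's sidebar selections and data.
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import streamlit as st

tokenizer_name = "openai/gpt4"       # one of tokenizer_names_to_test in the app
languages = ["English", "Burmese"]   # hypothetical sidebar selection

# Toy replacement for val_data: per-example token counts per language.
rng = np.random.default_rng(0)
val_data = pd.DataFrame({
    "lang": np.repeat(languages, 200),
    tokenizer_name: np.concatenate([
        rng.normal(20, 5, 200),      # shorter token sequences
        rng.normal(120, 30, 200),    # much longer token sequences
    ]).clip(min=1),
})

subset_df = val_data[val_data.lang.isin(languages)]
subset_data = [val_data[val_data.lang == _lang][tokenizer_name] for _lang in languages]

# Metrics first: this is the rearrangement, since they previously rendered below the figure.
st.subheader(f'Median Token Length for `{tokenizer_name}`')
metric_cols = st.columns(len(languages))
for i, _lang in enumerate(languages):
    metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang == _lang][tokenizer_name])))

# Distribution plot second, now with an explicit title and fixed height.
fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
fig.update_layout(
    title=dict(text='Token Distribution', font=dict(size=25), automargin=True, yref='paper'),
    xaxis_title="Number of Tokens",
    yaxis_title="Density",
    height=500,
)
st.plotly_chart(fig, use_container_width=True)

Saved as a standalone file, the sketch runs with streamlit run on that file and reproduces the new layout order without the rest of the app.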