yenniejun committed
Commit 5943f5c • Parent: 26e280e

Rearranging some of the figures

Files changed (1):
  1. app.py (+49 -11)
app.py CHANGED
@@ -18,12 +18,16 @@ def load_data():
 def reload_example_text_data():
     random_id = random.choice(val_data['id'])
     tempdf = subset_df[subset_df['id']==random_id]
-    tempdf.set_index('lang', inplace=True)
+    tempdf.rename(columns={'lang': 'Language'}, inplace=True)
+    tempdf.set_index('Language', inplace=True)
     tempdf = tempdf[['iso', 'text', tokenizer_name]]
     tempdf.columns=['ISO', 'Text', 'Num Tokens']
     tempdf.sort_values(by='ISO', inplace=True)
     st.session_state.examplesdf = tempdf
 
+
+
+
 # TODO allow new tokenizers from HF
 tokenizer_names_to_test = [
     "openai/gpt4",
@@ -57,6 +61,8 @@ with st.sidebar:
     val_data = load_data()
     st.success(f'Data loaded: {len(val_data)}')
 
+    # st.write(val_data.columns, val_data.head())
+
     with st.expander('Data Source'):
         st.write("The data in this figure is the validation set of the [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)")
 
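For anyone reproducing the numbers in the 'Data Source' expander, the validation split can be pulled with the `datasets` library. A minimal sketch; `af-ZA` is one of the 51 locale configs shown in the linked viewer, and the `utt` field name is an assumption based on that viewer:

```python
from datasets import load_dataset

# One config per locale; af-ZA (Afrikaans) matches the viewer link above.
massive_val = load_dataset("AmazonScience/massive", "af-ZA", split="validation")

print(len(massive_val))       # expected 2033 rows per locale, per the expander text
print(massive_val[0]["utt"])  # raw utterance text (field name assumed from the viewer)
```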
 
@@ -91,34 +97,64 @@ with st.container():
     if tokenizer_name in val_data.columns:
         subset_df = val_data[val_data.lang.isin(languages)]
         subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
-
-        st.header('Compare tokenization in different languages')
+
+        # st.header(f'Comparing languages for {tokenizer_name}')
+
+        st.subheader(f'Median Token Length for `{tokenizer_name}`')
+        metric_cols = st.columns(len(languages))
+        for i, _lang in enumerate(languages):
+            metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))
+
+
         fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=show_hist)
 
         fig.update_layout(
-            title=dict(text=tokenizer_name, font=dict(size=25), automargin=True, yref='paper', ),
-            # title=tokenizer_name,
+            title=dict(text='Token Distribution', font=dict(size=25), automargin=True, yref='paper', ),
+            # title='Distribution of tokens',
             xaxis_title="Number of Tokens",
             yaxis_title="Density",
+            height=500
             # title_font_family='"Source Sans Pro", sans-serif'
         )
         st.plotly_chart(fig, use_container_width=True)
 
-        st.subheader('Median Token Length')
-        metric_cols = st.columns(len(languages))
-        for i, _lang in enumerate(languages):
-            metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))
+
+
 
 
         st.subheader('Example Texts')
-
         reload_example_text_data()
         if st.button("🔄 Randomly sample"):
             reload_example_text_data()
-
         st.dataframe(st.session_state.examplesdf) # Same as st.write(df)
 
 
+        # val_median_data = val_data.groupby('lang')[tokenizer_name].apply(np.median)
+        # val_median_data = val_median_data.sort_values(ascending=False)
+        # val_median_data = val_median_data.reset_index()
+        # # val_median_data = val_median_data[val_median_data.lang.isin(languages)]
+        # val_median_data[tokenizer_name] = val_median_data[tokenizer_name].astype(int)
+        # val_median_data.columns = ['Language', 'Median Number of Tokens']
+        # # st.write(val_median_data.head())
+        # bar_fig = px.bar(
+        #     val_median_data,
+        #     y='Language',
+        #     x='Median Number of Tokens',
+        #     text_auto='d',
+        #     orientation='h',
+        #     hover_data=val_median_data.columns,
+        #     height=1000,
+        # )
+        # bar_fig.update_traces(textfont_size=12, textangle=0, cliponaxis=False)
+        # bar_fig.update_layout(
+        #     title=dict(text='Comparison of median token lengths',
+        #                font=dict(size=20),
+        #                automargin=True, yref='paper', ),
+        # )
+        # st.plotly_chart(bar_fig, use_container_width=True)
+
+
+
 
 
 
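Review note: the commented-out bar chart kept in this hunk recomputes per-language medians by hand. If it comes back later, the aggregation could be a single chain; a sketch assuming the same `val_data` layout (one row per translated utterance, one token-count column per tokenizer):

```python
import pandas as pd

def median_tokens_by_language(val_data: pd.DataFrame, tokenizer_name: str) -> pd.DataFrame:
    """Median token count per language for one tokenizer, longest first."""
    medians = (
        val_data.groupby('lang')[tokenizer_name]
        .median()                        # same result as np.median per group
        .sort_values(ascending=False)
        .astype(int)
        .reset_index()
    )
    medians.columns = ['Language', 'Median Number of Tokens']
    return medians
```

The result would plug straight into `px.bar(..., y='Language', x='Median Number of Tokens', orientation='h')` as in the commented block.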
 
@@ -126,6 +162,8 @@ with st.container():
 
 
 
+
+
     with st.expander("About the project"):
         st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP.")
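The 10-20x claim in the 'About' text is easy to spot-check with any Hugging Face tokenizer. A hedged sketch; `xlm-roberta-base` is only an example model, and the Burmese string is a rough rendering for illustration:

```python
from transformers import AutoTokenizer

# Any pretrained multilingual tokenizer works here; xlm-roberta-base is just an example.
tok = AutoTokenizer.from_pretrained("xlm-roberta-base")

english = "wake me up at nine am on friday"
burmese = "သောကြာနေ့ မနက် ကိုးနာရီမှာ နှိုးပေးပါ"  # rough translation, illustrative only

print(len(tok.encode(english)))  # token count for the English utterance
print(len(tok.encode(burmese)))  # typically several times longer
```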
 
 