yenniejun committed
Commit 616477f
Parent: fe5c81b

Adding explanation for project

Files changed (1): app.py (+22, -6)
app.py CHANGED
@@ -33,11 +33,25 @@ with st.sidebar:
     # TODO multi-select tokenizers
     tokenizer_name = st.sidebar.selectbox('Select tokenizer', options=tokenizer_names_to_test, label_visibility='collapsed')

-    st.subheader('Data Source: [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation)')
+    if tokenizer_name not in ['openai/gpt4']:
+        url = f'https://huggingface.co/{tokenizer_name}'
+        link = f'Tokenizer is available [on the HuggingFace hub]({url})'
+        st.markdown(link, unsafe_allow_html=True)
+    else:
+        link="Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
+        st.markdown(link)
+
+
+    st.subheader('Data')
     with st.spinner('Loading dataset...'):
         val_data = load_data()
         st.success(f'Data loaded: {len(val_data)}')
-
+
+    with st.expander('Data Source: [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation)'):
+        st.write("The data in this figure is the validation set of the Amazon Massive dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)")
+
+
+    st.subheader('Languages')
     languages = st.multiselect(
         'Select languages',
         options=sorted(val_data.lang.unique()),
@@ -48,6 +62,9 @@ with st.sidebar:

     st.subheader('Figure')
     show_hist = st.checkbox('Show histogram', value=False)
+
+
+
     # dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)

     # with st.spinner('Loading tokenizer...'):
@@ -82,8 +99,7 @@ with st.container():
     for i, _lang in enumerate(languages):
         metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))

-    if tokenizer_name not in ['openai/gpt4']:
-        url = f'https://huggingface.co/{tokenizer_name}'
-        link = f'[Find on the HuggingFace hub]({url})'
-        st.markdown(link, unsafe_allow_html=True)
+
+    with st.expander("About the project"):
+        st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 15-20x more tokens than a comparable message in another language.")

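The "About the project" expander and the dataset note describe what the app computes: token counts per message on the MASSIVE validation set, aggregated by language. For readers who want to reproduce that comparison outside Streamlit, here is a minimal sketch. It assumes the datasets, transformers, tiktoken, and numpy packages; the xlm-roberta-base tokenizer and the two locales are illustrative choices, not taken from this commit.

# A minimal sketch (assumed setup, not the app's own code): median tokens per
# message for two MASSIVE locales, under one Hub tokenizer and GPT-4's
# tiktoken encoding. This mirrors the per-language median the app reports.
import numpy as np
import tiktoken
from datasets import load_dataset
from transformers import AutoTokenizer

hub_tok = AutoTokenizer.from_pretrained('xlm-roberta-base')  # illustrative Hub tokenizer
gpt4_enc = tiktoken.encoding_for_model('gpt-4')              # the 'openai/gpt4' branch above

for locale in ['en-US', 'ko-KR']:  # two of MASSIVE's 51 locales
    # Newer `datasets` releases may require trust_remote_code=True here.
    val = load_dataset('AmazonScience/massive', locale, split='validation')
    texts = val['utt']  # MASSIVE stores the raw utterance in the 'utt' field
    hub_median = int(np.median([len(hub_tok.encode(t)) for t in texts]))
    gpt4_median = int(np.median([len(gpt4_enc.encode(t)) for t in texts]))
    print(f'{locale}: hub={hub_median} tokens, gpt-4={gpt4_median} tokens')

Comparing the medians across locales shows the disparity the figure plots: languages whose scripts are thinly represented in a tokenizer's training data fragment into many more tokens per message, which is the kind of gap the 15-20x remark refers to.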