import random

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import streamlit as st
from transformers import AutoTokenizer  # used by the on-the-fly tokenization path below
@st.cache_data
def load_data():
    return pd.read_csv('MassiveDatasetValidationData.csv')
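# Based on how the columns are used below, the CSV is expected to contain
# 'id', 'lang', 'iso', and 'text', plus one precomputed token-count column
# per tokenizer name in the list further down.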
def reload_example_text_data():
    # Draw one random utterance id and collect its translations, with token
    # counts for the selected tokenizer. Relies on the module-level val_data,
    # subset_df, and tokenizer_name defined further down.
    random_id = random.choice(val_data['id'].tolist())
    tempdf = subset_df[subset_df['id'] == random_id].copy()  # .copy() avoids SettingWithCopyWarning
    tempdf.rename(columns={'lang': 'Language'}, inplace=True)
    tempdf.set_index('Language', inplace=True)
    tempdf = tempdf[['iso', 'text', tokenizer_name]]
    tempdf.columns = ['ISO', 'Text', 'Num Tokens']
    tempdf.sort_values(by='ISO', inplace=True)
    st.session_state.examplesdf = tempdf
# TODO: allow user-supplied tokenizers from the Hugging Face Hub
# (a sketch follows this list).
tokenizer_names_to_test = [
    "openai/gpt4",  # handled via tiktoken rather than the Hub
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",  # Facebook
    "xlm-roberta-base",  # old-style repo id
    "bert-base-uncased",  # old-style repo id
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",  # BigScience
    "StabilityAI/stablelm-base-alpha-7b",  # StableLM with Open Assistant
    "google/flan-t5-base",  # Flan-T5 (better than T5), Google
    "facebook/mbart-large-50",  # Facebook
    "EleutherAI/gpt-neox-20b",  # same tokenizer as Pythia
]
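# Minimal sketch for the TODO above (hypothetical helper, not yet wired into
# the UI): validate a user-supplied repo id by loading its tokenizer before
# offering it in the selectbox.
def add_custom_tokenizer(repo_id: str) -> bool:
    try:
        AutoTokenizer.from_pretrained(repo_id)
    except Exception:
        return False
    tokenizer_names_to_test.append(repo_id)
    return True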
with st.sidebar:
    st.header('All languages are NOT created (tokenized) equal!')
    link = "This project compares tokenization lengths across languages. With some tokenizers, a message in one language can produce 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese)."
    st.markdown(link)
    link = "This is part of a larger project on measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
    st.markdown(link)

    st.header('Data Visualization')
    st.subheader('Tokenizer')
    # TODO: support selecting multiple tokenizers at once
    tokenizer_name = st.selectbox('Select tokenizer', options=tokenizer_names_to_test, label_visibility='collapsed')
    if tokenizer_name not in ['openai/gpt4']:
        url = f'https://huggingface.co/{tokenizer_name}'
        link = f'Tokenizer is available [on the Hugging Face Hub]({url})'
        st.markdown(link, unsafe_allow_html=True)
    else:
        link = "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
        st.markdown(link)

    st.subheader('Data')
    with st.spinner('Loading dataset...'):
        val_data = load_data()
        st.success(f'Data loaded: {len(val_data):,} rows')
    # st.write(val_data.columns, val_data.head())
    with st.expander('Data Source'):
        st.write("The data in this figure is the validation set of the [Amazon MASSIVE](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2,033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding).")

    st.subheader('Languages')
    languages = st.multiselect(
        'Select languages',
        options=sorted(val_data.lang.unique()),
        default=['English', 'Spanish', 'Chinese', 'Burmese'],
        max_selections=6,
        label_visibility='collapsed',
    )

    st.subheader('Figure')
    show_hist = st.checkbox('Show histogram', value=False)
    # dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)

# Token counts for the listed tokenizers are precomputed in the CSV; the
# commented-out block below shows how they would be computed on the fly.
# with st.spinner('Loading tokenizer...'):
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
#     st.success(f'Tokenizer loaded: {tokenizer_name}')
# # TODO: add the metadata as well (maybe later)
# with st.spinner('Calculating tokenization for data...'):
#     if tokenizer_name not in val_data.columns:
#         val_data[f'{tokenizer_name}'] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))
#     st.success('Completed.')
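# Hedged sketch of how the precomputed columns could be generated offline,
# assuming the CSV schema above. The hypothetical 'openai/gpt4' column would
# instead come from tiktoken, so it is skipped here.
def precompute_token_counts(df: pd.DataFrame, names: list[str]) -> pd.DataFrame:
    for name in names:
        if name == 'openai/gpt4' or name in df.columns:
            continue  # already computed, or not an AutoTokenizer repo
        tok = AutoTokenizer.from_pretrained(name)
        df[name] = df.text.apply(lambda t: len(tok.encode(t)))
    return df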
with st.container():
    if tokenizer_name in val_data.columns:
        subset_df = val_data[val_data.lang.isin(languages)]
        subset_data = [val_data[val_data.lang == _lang][tokenizer_name] for _lang in languages]

        st.subheader(f'Median Token Length for `{tokenizer_name}`')
        metric_cols = st.columns(len(languages))
        for i, _lang in enumerate(languages):
            metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang == _lang][tokenizer_name])))

        fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=show_hist)
        fig.update_layout(
            title=dict(text='Token Distribution', font=dict(size=25), automargin=True, yref='paper'),
            xaxis_title="Number of Tokens",
            yaxis_title="Density",
            height=500,
        )
        st.plotly_chart(fig, use_container_width=True)

        # Horizontal bar chart of the languages with the shortest and longest
        # median token lengths for this tokenizer.
        shortest = val_data.groupby('lang')[tokenizer_name].median().sort_values().head(7).reset_index()
        shortest["type"] = "shortest"
        longest = val_data.groupby('lang')[tokenizer_name].median().sort_values().tail(7).reset_index()
        longest["type"] = "longest"
        combined = pd.concat([shortest, longest]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)
        color_sequence = px.colors.qualitative.D3  # any built-in qualitative sequence works here
        fig = px.bar(combined, x=tokenizer_name, y="lang", orientation='h', color='type', color_discrete_sequence=color_sequence)
        fig.update_traces(hovertemplate='%{y}: %{x} tokens')
        fig.update_layout(
            title=dict(text='Languages with the Shortest and Longest Median Token Lengths',
                       font=dict(size=25), automargin=True, yref='paper', pad=dict(b=20)),
            xaxis=dict(
                title="Number of Tokens",
                showgrid=True,       # show vertical gridlines
                gridwidth=1,
                gridcolor='LightGrey',
            ),
            yaxis=dict(title=""),
            height=400,
            showlegend=False,
        )
        st.plotly_chart(fig, use_container_width=True)

        st.subheader('Example Texts')
        # The button triggers a Streamlit rerun; sampling once per run avoids
        # drawing two samples on a single click.
        st.button("🔄 Randomly sample")
        reload_example_text_data()
        st.dataframe(st.session_state.examplesdf)  # same as st.write(df)
    else:
        st.warning(f'No precomputed token counts found for `{tokenizer_name}`.')
# Alternative chart (kept for reference): median token length per language,
# across all languages, as one tall horizontal bar chart.
# val_median_data = val_data.groupby('lang')[tokenizer_name].apply(np.median)
# val_median_data = val_median_data.sort_values(ascending=False)
# val_median_data = val_median_data.reset_index()
# # val_median_data = val_median_data[val_median_data.lang.isin(languages)]
# val_median_data[tokenizer_name] = val_median_data[tokenizer_name].astype(int)
# val_median_data.columns = ['Language', 'Median Number of Tokens']
# # st.write(val_median_data.head())
# bar_fig = px.bar(
#     val_median_data,
#     y='Language',
#     x='Median Number of Tokens',
#     text_auto='d',
#     orientation='h',
#     hover_data=val_median_data.columns,
#     height=1000,
# )
# bar_fig.update_traces(textfont_size=12, textangle=0, cliponaxis=False)
# bar_fig.update_layout(
#     title=dict(text='Comparison of median token lengths',
#                font=dict(size=20),
#                automargin=True, yref='paper'),
# )
# st.plotly_chart(bar_fig, use_container_width=True)