fix tabs
app.py CHANGED
@@ -50,11 +50,10 @@ tokenizer_names_to_test = [
 
 with st.sidebar:
 
-
-
+    st.header('All languages are NOT created (tokenized) equal!')
+    link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
     st.markdown(link)
-
-	st.divider()
+    st.divider()
 
     st.subheader('Tokenizer')
     # TODO multi-select tokenizers
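The sidebar text added above claims a 10-20x token-count gap between comparable messages in different languages (e.g. English vs. Burmese). A minimal sketch of that comparison, not taken from the Space's code: the tokenizer name, the Burmese sample sentence, and the use of transformers' AutoTokenizer are illustrative assumptions.

# Sketch (assumed, not from app.py): count tokens for roughly equivalent messages
# in two languages and compare the lengths.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # assumed example model

english = "Hello, how are you today?"
burmese = "မင်္ဂလာပါ၊ ဒီနေ့ နေကောင်းလား။"  # roughly the same greeting in Burmese

for label, text in [("English", english), ("Burmese", burmese)]:
    n_tokens = len(tokenizer.encode(text, add_special_tokens=False))
    print(f"{label}: {n_tokens} tokens")

For tokenizers whose vocabularies are dominated by Latin-script text, the Burmese message tends to split into many more tokens than the English one, which is the disparity the Space visualizes.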
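The hunk's "# TODO multi-select tokenizers" presumably means letting users pick several entries from the tokenizer_names_to_test list named in the hunk header. A hypothetical sketch using Streamlit's st.multiselect; the list contents and the default selection below are placeholders, not the Space's actual values.

# Hypothetical sketch of the TODO item; tokenizer names are placeholders.
import streamlit as st

tokenizer_names_to_test = [
    "gpt2",
    "xlm-roberta-base",
    "facebook/nllb-200-distilled-600M",
]

with st.sidebar:
    selected_tokenizers = st.multiselect(
        "Tokenizers to compare",
        options=tokenizer_names_to_test,
        default=tokenizer_names_to_test[:1],
    )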