Spaces:
Running
Running
add doc
Browse files- compression_app.py +10 -6
- compression_util.py +6 -3
compression_app.py
CHANGED
|
@@ -43,12 +43,12 @@ Lossless tokenization preserves the exact original text, i.e. `decoded_text = in
|
|
| 43 |
|
| 44 |
- **Compression Rate** <br>
|
| 45 |
There are mainly two types of metric to represent the `input_text`:
|
| 46 |
-
- `
|
| 47 |
-
- `
|
| 48 |
|
| 49 |
-
To evaluate compression rate, simple metrics can be "how many
|
| 50 |
-
In this leaderboard, we adopt more frequently used metric: "how many
|
| 51 |
-
per
|
| 52 |
💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
|
| 53 |
"""
|
| 54 |
|
|
@@ -141,7 +141,11 @@ with gr.Blocks(theme=theme) as demo:
|
|
| 141 |
"You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
|
| 142 |
)
|
| 143 |
|
| 144 |
-
gr.Markdown("## 🏆 Compression Rate Leaderboard"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
search_bar = gr.Textbox(
|
| 146 |
placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
|
| 147 |
show_label=False,
|
|
|
|
| 43 |
|
| 44 |
- **Compression Rate** <br>
|
| 45 |
There are mainly two types of metric to represent the `input_text`:
|
| 46 |
+
- `char-level`: the number of characters in the given text.
|
| 47 |
+
- `byte-level`: the number of bytes in the given text.
|
| 48 |
|
| 49 |
+
To evaluate compression rate, simple metrics can be "how many chars per token" or "how many bytes per token". <br>
|
| 50 |
+
In this leaderboard, we adopt the more frequently used metrics: "how many chars per token" and
|
| 51 |
+
"how many billion tokens per gigabyte of corpus", i.e. `char/token` and `b_tokens/g_bytes`.
|
| 52 |
💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
|
| 53 |
"""
|
| 54 |
|
|
|
|
| 141 |
"You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
|
| 142 |
)
|
| 143 |
|
| 144 |
+
gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
|
| 145 |
+
"The leaderboard aim to evaluate tokenizer performance on different languages.\n"
|
| 146 |
+
"Lower `oov_ratio` refers to less out-of-vocabulary tokens.\n"
|
| 147 |
+
"Higher `char/token` means less words be segmented into subwords."
|
| 148 |
+
)
|
| 149 |
search_bar = gr.Textbox(
|
| 150 |
placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
|
| 151 |
show_label=False,
|
compression_util.py
CHANGED
|
@@ -295,9 +295,12 @@ def get_compression_leaderboard(
|
|
| 295 |
if return_type == "dataframe":
|
| 296 |
token_number_unit, file_size_unit = unit.split("/")
|
| 297 |
reverse_unit = f"{file_size_unit}/{token_number_unit}"
|
| 298 |
-
stats = to_dataframe(stats, [
|
| 299 |
-
stats = stats.sort_values(["oov_ratio",
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
| 301 |
return stats
|
| 302 |
|
| 303 |
|
|
|
|
| 295 |
if return_type == "dataframe":
|
| 296 |
token_number_unit, file_size_unit = unit.split("/")
|
| 297 |
reverse_unit = f"{file_size_unit}/{token_number_unit}"
|
| 298 |
+
stats = to_dataframe(stats, ["char/token", unit, reverse_unit])
|
| 299 |
+
stats = stats.sort_values(["oov_ratio", "char/token"], ascending=[True, False])
|
| 300 |
+
|
| 301 |
+
# stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
|
| 302 |
+
|
| 303 |
+
stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={"char/token": ' ⬇️char/token'}) #
|
| 304 |
return stats
|
| 305 |
|
| 306 |
|