xu-song committed
Commit 171654c · 1 Parent(s): 2d7d5b2
Files changed (1):
  1. config.py +139 -0
config.py ADDED
@@ -0,0 +1,139 @@
+ """
+
+ ## characters
+
+ - alphabetic characters
+ - numeric characters
+ - special characters: any character that is not an alphabetic or numeric character
+ - ASCII control characters
+ - punctuation marks
+ - accent marks
+ - mathematical symbols
+ - whitespace:
+   - https://en.wikipedia.org/wiki/Whitespace_character
+   - https://emptycharacter.com/
+
+ https://www.computerhope.com/jargon/s/specchar.htm
+ """
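+
+ # A minimal sketch (not used elsewhere in this module) of how the classes
+ # listed above map onto Unicode general categories via the stdlib:
+ # "L*" letters, "N*" numbers, "P*" punctuation, "S*" symbols (Sm = math),
+ # "Z*" separators/whitespace, "Cc" ASCII control characters.
+ # The helper name `classify_char` is illustrative only.
+ def classify_char(ch: str) -> str:
+     import unicodedata
+     return unicodedata.category(ch)  # e.g. classify_char("÷") == "Sm"
+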
+
+ import random
+ from datasets import load_dataset
+
+ default_user_input = """\
+ Replace this text in the input field to see how tokenization works.
+ Buenos días!
+ Tokenizer 是自然语言处理(NLP)中的一个关键组件,它的主要作用是将人类语言文本转换为计算机可以理解的数字表示形式。
+ ラグビーワールドカップ2023フランス"""
+ # default_tokenizer_name_1 = "Meta/llama3"
+ # default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
+ default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
+ default_tokenizer_name_2 = "openai/gpt-4o"
+
+
+ def get_sample_input():
+     default_inputs = {
+         "en": "Replace this text in the input field to see how tokenization works.",
+         "zh-Hans": "",
+         "es": "",
+         "de": "",
+     }
+     random.seed(10)  # For reproducibility
+     for lang in default_inputs.keys():
+         dataset = load_dataset("eson/cc100-samples", lang, split="train")
+         # Fill empty entries with a random sample line; assumes each row
+         # has a "text" field, as in the upstream cc100 dataset.
+         if not default_inputs[lang]:
+             default_inputs[lang] = random.choice(dataset)["text"]
+     return default_inputs
+
49
+
50
+ examples = {
51
+ "en": [
52
+ ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"], #
53
+ [
54
+ "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
55
+ "huggyllama/llama-7b",
56
+ "google-bert/bert-base-cased",
57
+ ], # chatglm 有blank_n, bert丢掉了空格,
58
+ # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
59
+ [
60
+ 'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
61
+ "google/gemma-7b",
62
+ "huggyllama/llama-7b",
63
+ ], # llama词典有点小
64
+ [
65
+ "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
66
+ "baichuan-inc/Baichuan-7B",
67
+ "huggyllama/llama-7b",
68
+ ],
69
+ # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
70
+ ],
71
+ "zh": [
72
+ [
73
+ "空格测试: 2个空格 8个空格",
74
+ "llama",
75
+ "chatglm2_6b",
76
+ ], # chatglm 有blank_n,
77
+ ["标点测试:,。!?;", "baichuan_7b", "llama"],
78
+ [
79
+ "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
80
+ "baichuan_7b",
81
+ "llama",
82
+ ],
83
+ ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
84
+ ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
85
+ ],
86
+ }
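+ # Each row above is [sample text, tokenizer_1, tokenizer_2]; example_fn()
+ # further below returns the row at a given index, presumably for the demo UI.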
+
+
+ more_examples = [
+     # BERT family
+     (
+         "google-bert/bert-base-cased",
+         "google-bert/bert-base-uncased",
+         "",
+         "",
+     ),  # clue VS kplug, bert VS clue
+     ("bert-base-cased", "clue", "", "added []()"),
+     ("roberta-chinese-clue", "kplug", "", ""),
+     # LLaMA family (sentencepiece-based)
+     (
+         "baichuan",
+         "baichuan2",
+         "baichuan2 supports multiple spaces  ,multiple newlines\n\n\n,do not add dummy prefix as Baichuan1",
+         "",
+     ),
+     ("llama", "baichuan2", "baichuan2 supports multiple spaces  ,multiple newlines\n\n", ""),
+     ("llama", "chinese-llama-2-7b", "", ""),
+     ("llama", "llama3", "", "expanded vocabulary"),
+     ("chinese-llama-lora-7b", "chinese-llama-2-7b", "", ""),
+     # GLM family (sentencepiece-based)
+     ("glm", "chatglm1", "", ""),
+     ("chatglm1", "chatglm2", "", ""),
+     # GPT-2 family
+     ("gpt2", "moss", "", ""),
+     # OpenAI family (tiktoken)
+     ("qwen", "gpt_35_turbo", "", ""),
+     ("gpt4", "gpt-4o", "", "gpt_35_turbo and gpt4 vocabularies are only ~100k; gpt-4o's is ~200k"),
+ ]
+
+ lang = "en"
+
+ example_types = [t[0].split(":")[0] for t in examples[lang]]
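+ # With the "en" examples above this is ["number", "whitespace", "punctuation", "symbol"].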
+
+
+ def example_fn(example_idx):
+     return examples[lang][example_idx]
+
+
+ def get_more_example():
+     import urllib.parse
+
+     url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
+     for tokenizer1, tokenizer2, text, comment in more_examples:
+         full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
+         print(full_url)
+
+
+ if __name__ == "__main__":
+     get_more_example()
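+
+ # Running `python config.py` prints one pre-filled comparison URL per entry in
+ # more_examples, e.g.:
+ # https://huggingface.co/spaces/eson/tokenizer-arena?tokenizer1=qwen&tokenizer2=gpt_35_turbo&text=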