xu-song committed
Commit 171654c · 1 Parent(s): 2d7d5b2
Files changed (1):
  1. config.py +139 -0
config.py ADDED
@@ -0,0 +1,139 @@
+ """
+
+ ## characters
+
+ - alphabetic characters
+ - numeric characters
+ - special characters: any character that is not an alphabetic or numeric character
+ - ASCII control characters
+ - punctuation marks
+ - accent marks
+ - mathematical symbols
+ - whitespace:
+   - https://en.wikipedia.org/wiki/Whitespace_character
+   - https://emptycharacter.com/
+
+ https://www.computerhope.com/jargon/s/specchar.htm
+ """
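+
+ # A minimal sketch (not used elsewhere in this module) of how the classes
+ # listed above map onto Unicode general categories via the stdlib:
+ # "L*" letters, "N*" numbers, "P*" punctuation, "S*" symbols (Sm = math),
+ # "Z*" separators/whitespace, "Cc" ASCII control characters.
+ # The helper name `classify_char` is illustrative only.
+ def classify_char(ch: str) -> str:
+     import unicodedata
+     return unicodedata.category(ch)  # e.g. classify_char("÷") == "Sm"
+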
+
+ import random
+ from datasets import load_dataset
+
+ default_user_input = """\
+ Replace this text in the input field to see how tokenization works.
+ Buenos días!
+ Tokenizer 是自然语言处理(NLP)中的一个关键组件,它的主要作用是将人类语言文本转换为计算机可以理解的数字表示形式。
+ ラグビーワールドカップ2023フランス"""
+ # default_tokenizer_name_1 = "Meta/llama3"
+ # default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
+ default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
+ default_tokenizer_name_2 = "openai/gpt-4o"
+
+
+ def get_sample_input():
+     default_inputs = {
+         "en": "Replace this text in the input field to see how tokenization works.",
+         "zh-Hans": "",
+         "es": "",
+         "de": "",
+     }
+     random.seed(10)  # For reproducibility
+     for lang in default_inputs.keys():
+         dataset = load_dataset("eson/cc100-samples", lang, split="train")
+         # Fill empty entries with a random sample line; assumes each row
+         # has a "text" field, as in the upstream cc100 dataset.
+         if not default_inputs[lang]:
+             default_inputs[lang] = random.choice(dataset)["text"]
+     return default_inputs
+
49
+
50
+ examples = {
51
+ "en": [
52
+ ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"], #
53
+ [
54
+ "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
55
+ "huggyllama/llama-7b",
56
+ "google-bert/bert-base-cased",
57
+ ], # chatglm 有blank_n, bert丢掉了空格,
58
+ # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
59
+ [
60
+ 'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
61
+ "google/gemma-7b",
62
+ "huggyllama/llama-7b",
63
+ ], # llama词典有点小
64
+ [
65
+ "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
66
+ "baichuan-inc/Baichuan-7B",
67
+ "huggyllama/llama-7b",
68
+ ],
69
+ # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
70
+ ],
71
+ "zh": [
72
+ [
73
+ "空格测试: 2个空格 8个空格",
74
+ "llama",
75
+ "chatglm2_6b",
76
+ ], # chatglm 有blank_n,
77
+ ["标点测试:,。!?;", "baichuan_7b", "llama"],
78
+ [
79
+ "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
80
+ "baichuan_7b",
81
+ "llama",
82
+ ],
83
+ ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
84
+ ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
85
+ ],
86
+ }
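+ # Each row above is [sample text, tokenizer_1, tokenizer_2]; example_fn()
+ # further below returns the row at a given index, presumably for the demo UI.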
+
+
+ more_examples = [
+     # BERT family
+     (
+         "google-bert/bert-base-cased",
+         "google-bert/bert-base-uncased",
+         "",
+         "",
+     ),  # clue VS kplug, bert VS clue
+     ("bert-base-cased", "clue", "", "added []()"),
+     ("roberta-chinese-clue", "kplug", "", ""),
+     # LLaMA family (sentencepiece-based)
+     (
+         "baichuan",
+         "baichuan2",
+         "baichuan2 supports multiple spaces  ,multiple newlines\n\n\n,do not add dummy prefix as Baichuan1",
+         "",
+     ),
+     ("llama", "baichuan2", "baichuan2 supports multiple spaces  ,multiple newlines\n\n", ""),
+     ("llama", "chinese-llama-2-7b", "", ""),
+     ("llama", "llama3", "", "expanded vocabulary"),
+     ("chinese-llama-lora-7b", "chinese-llama-2-7b", "", ""),
+     # GLM family (sentencepiece-based)
+     ("glm", "chatglm1", "", ""),
+     ("chatglm1", "chatglm2", "", ""),
+     # GPT-2 family
+     ("gpt2", "moss", "", ""),
+     # OpenAI family (tiktoken)
+     ("qwen", "gpt_35_turbo", "", ""),
+     ("gpt4", "gpt-4o", "", "gpt_35_turbo and gpt4 vocabularies are only ~100k; gpt-4o's is ~200k"),
+ ]
+
+ lang = "en"
+
+ example_types = [t[0].split(":")[0] for t in examples[lang]]
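+ # With the "en" examples above this is ["number", "whitespace", "punctuation", "symbol"].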
+
+
+ def example_fn(example_idx):
+     return examples[lang][example_idx]
+
+
+ def get_more_example():
+     import urllib.parse
+
+     url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
+     for tokenizer1, tokenizer2, text, comment in more_examples:
+         full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
+         print(full_url)
+
+
+ if __name__ == "__main__":
+     get_more_example()
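+
+ # Running `python config.py` prints one pre-filled comparison URL per entry in
+ # more_examples, e.g.:
+ # https://huggingface.co/spaces/eson/tokenizer-arena?tokenizer1=qwen&tokenizer2=gpt_35_turbo&text=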