Norod78 committed
Commit 830833d
1 Parent(s): ced5e66

Add TikToken extension support for the Hebrew Tokenizer

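Background for reviewers: tiktoken picks up third-party encodings through the tiktoken_ext namespace package. On first use, tiktoken's registry iterates over the modules found under tiktoken_ext and reads each module's ENCODING_CONSTRUCTORS dict (encoding name -> zero-argument constructor), which is what makes the new names added below resolvable via tiktoken.get_encoding. A minimal sketch of that discovery step, assuming tiktoken and at least one plugin module are installed; with this commit the printed names would include the two constructors registered below:

    import importlib
    import pkgutil

    import tiktoken_ext  # namespace package that tiktoken scans for plugins

    # Enumerate plugin modules the same way tiktoken's registry does and list
    # the encoding names each one contributes.
    for _finder, mod_name, _is_pkg in pkgutil.iter_modules(
        tiktoken_ext.__path__, tiktoken_ext.__name__ + "."
    ):
        mod = importlib.import_module(mod_name)
        print(mod_name, "->", sorted(mod.ENCODING_CONSTRUCTORS))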
tiktoken/tests/test_compare_hebrew.py ADDED
@@ -0,0 +1,23 @@
+ import tiktoken
+
+ test_string = "האיש האחרון עלי אדמות ישב לבד בחדרו, כשלפתע נשמעה דפיקה בדלת"
+
+ print(f'Test string = "{test_string}"')
+
+ enc = tiktoken.get_encoding("cl100k_base")
+ encoded_text = enc.encode(test_string)
+ print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} (cl100k_base)')
+ decoded_text = enc.decode(encoded_text)
+ assert decoded_text == test_string
+
+ enc = tiktoken.get_encoding("gpt2")
+ encoded_text = enc.encode(test_string)
+ print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} (gpt2)')
+ decoded_text = enc.decode(encoded_text)
+ assert decoded_text == test_string
+
+ enc = tiktoken.get_encoding("gpt-hebrew-tokenizer")
+ encoded_text = enc.encode(test_string)
+ print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} (gpt-hebrew-tokenizer)')
+ decoded_text = enc.decode(encoded_text)
+ assert decoded_text == test_string
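For reference, the Hebrew test string translates to "The last man on earth sat alone in his room, when suddenly a knock was heard at the door." The script prints the token count each encoding produces for the same sentence and asserts a lossless encode/decode round trip. The same round-trip check could also be written as a parametrized pytest test; a sketch (hypothetical, not part of this commit):

    import pytest
    import tiktoken

    TEST_STRING = "האיש האחרון עלי אדמות ישב לבד בחדרו, כשלפתע נשמעה דפיקה בדלת"

    @pytest.mark.parametrize("name", ["cl100k_base", "gpt2", "gpt-hebrew-tokenizer"])
    def test_hebrew_roundtrip(name):
        # Encoding then decoding must reproduce the original text exactly.
        enc = tiktoken.get_encoding(name)
        assert enc.decode(enc.encode(TEST_STRING)) == TEST_STRING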
tiktoken/tiktoken_ext/tiktoken_ext_norod78_hf.py ADDED
@@ -0,0 +1,32 @@
+ from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
+
+ def gpt_j_hebrew_tokenizer():
+     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
+         vocab_bpe_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/merges.txt",
+         encoder_json_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/vocab.json",
+     )
+     return {
+         "name": "gpt-j-hebrew-tokenizer",
+         "explicit_n_vocab": 50257,
+         "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+         "mergeable_ranks": mergeable_ranks,
+         "special_tokens": {"<|endoftext|>": 50256},
+     }
+
+ def gpt_hebrew_tokenizer():
+     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
+         vocab_bpe_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/merges.txt",
+         encoder_json_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/vocab.json",
+     )
+     return {
+         "name": "gpt-hebrew-tokenizer",
+         "explicit_n_vocab": 50259,
+         "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+         "mergeable_ranks": mergeable_ranks,
+         "special_tokens": {"<|endoftext|>": 50256, "<|startoftext|>": 50257, "<|pad|>": 50258},
+     }
+
+ ENCODING_CONSTRUCTORS = {
+     "gpt-j-hebrew-tokenizer": gpt_j_hebrew_tokenizer,
+     "gpt-hebrew-tokenizer": gpt_hebrew_tokenizer,
+ }
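Here data_gym_to_mergeable_bpe_ranks converts GPT-2-style vocab.json/merges.txt files (fetched from the Hugging Face Hub URLs above) into the byte-level rank table tiktoken expects, and pat_str is the standard GPT-2 pre-tokenization regex. Once this module is importable under tiktoken_ext, both encodings resolve by name. A short usage sketch; note that tiktoken refuses special tokens in plain input unless they are explicitly allowed:

    import tiktoken

    enc = tiktoken.get_encoding("gpt-hebrew-tokenizer")

    # Ordinary text round-trips through the BPE ranks fetched from the Hub.
    text = "שלום עולם"  # "Hello, world"
    assert enc.decode(enc.encode(text)) == text

    # Special tokens must be allowed explicitly, otherwise encode() raises.
    ids = enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
    assert ids == [50256]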