Add TikToken extension support for the Hebrew Tokenizer
tiktoken/tests/test_compare_hebrew.py
ADDED
@@ -0,0 +1,23 @@
+import tiktoken
+
+test_string = "האיש האחרון עלי אדמות ישב לבד בחדרו, כשלפתע נשמעה דפיקה בדלת"
+
+print(f'Test string = "{test_string}"')
+
+enc = tiktoken.get_encoding("cl100k_base")
+encoded_text = enc.encode(test_string)
+print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} (cl100k_base)')
+decoded_text = enc.decode(encoded_text)
+assert decoded_text == test_string
+
+enc = tiktoken.get_encoding("gpt2")
+encoded_text = enc.encode(test_string)
+print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} (gpt2)')
+decoded_text = enc.decode(encoded_text)
+assert decoded_text == test_string
+
+enc = tiktoken.get_encoding("gpt-hebrew-tokenizer")
+encoded_text = enc.encode(test_string)
+print(f'num of characters = {len(test_string)} encoded length = {len(encoded_text)} (gpt-hebrew-tokenizer)')
+decoded_text = enc.decode(encoded_text)
+assert decoded_text == test_string
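
The script above round-trips the same Hebrew sentence through three encodings and prints the token counts side by side; a Hebrew-aware vocabulary should need noticeably fewer tokens per character than cl100k_base or gpt2. A minimal sketch of the same comparison generalized over every registered encoding, assuming tiktoken.list_encoding_names() to enumerate them:

import tiktoken

test_string = "האיש האחרון עלי אדמות ישב לבד בחדרו, כשלפתע נשמעה דפיקה בדלת"

# Compare compression across every encoding the registry knows about,
# including any contributed through tiktoken_ext plugins.
for name in tiktoken.list_encoding_names():
    enc = tiktoken.get_encoding(name)
    tokens = enc.encode(test_string)
    assert enc.decode(tokens) == test_string  # round-trip must be lossless
    print(f"{name}: {len(test_string)} characters -> {len(tokens)} tokens")
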
tiktoken/tiktoken_ext/tiktoken_ext_norod78_hf.py
ADDED
@@ -0,0 +1,32 @@
+from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
+
+def gpt_j_hebrew_tokenizer():
+    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
+        vocab_bpe_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/merges.txt",
+        encoder_json_file="https://huggingface.co/Norod78/gpt-j-hebrew-tokenizer/raw/main/vocab.json",
+    )
+    return {
+        "name": "gpt-j-hebrew-tokenizer",
+        "explicit_n_vocab": 50257,
+        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": {"<|endoftext|>": 50256},
+    }
+
+def gpt_hebrew_tokenizer():
+    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
+        vocab_bpe_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/merges.txt",
+        encoder_json_file="https://huggingface.co/Norod78/TinyStories-3M-val-Hebrew/raw/main/vocab.json",
+    )
+    return {
+        "name": "gpt-hebrew-tokenizer",
+        "explicit_n_vocab": 50259,
+        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": {"<|endoftext|>": 50256, "<|startoftext|>": 50257, "<|pad|>": 50258},
+    }
+
+ENCODING_CONSTRUCTORS = {
+    "gpt-j-hebrew-tokenizer": gpt_j_hebrew_tokenizer,
+    "gpt-hebrew-tokenizer": gpt_hebrew_tokenizer,
+}
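
tiktoken discovers this module automatically: at registry load time it imports every submodule of the tiktoken_ext namespace package and merges each one's ENCODING_CONSTRUCTORS mapping into the set of known encodings, so the new names become available to tiktoken.get_encoding() with no further registration. A minimal usage sketch once the package is installed (the sample string is illustrative):

import tiktoken

# The constructor runs lazily on first lookup and fetches the
# merges/vocab files from the Hugging Face URLs above, caching them.
enc = tiktoken.get_encoding("gpt-hebrew-tokenizer")

text = "שלום עולם"  # Hebrew for "hello world"
tokens = enc.encode(text)
assert enc.decode(tokens) == text

# Special tokens are disallowed by default and must be permitted explicitly.
ids = enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})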