nbroad HF staff committed on
Commit
6601038
1 Parent(s): 8532dcf
README.md CHANGED
@@ -1,3 +1,13 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ # donut-base-ascii
6
+
7
+ This is `"naver-clova-ix/donut-base"` but with all non-ASCII tokens removed from the tokenizer vocabulary and the decoder embeddings. This means the model is good for basic English use cases where the text is primarily a-zA-Z0-9 and basic punctuation.
8
+
9
+
10
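+ The original model, `"naver-clova-ix/donut-base"`, did not have a standalone token for `"1"` (only tokens such as `" 1"`, `"10"`, and `"1.1"`), so a `"1"` directly following another character could be tokenized as `<unk>`. A `"1"` token has therefore also been added. The notebook `remove-donut-tokens.ipynb` details the whole process.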
11
+
12
+
13
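+ This model has not been trained any further than the original model. The embedding for the new `"1"` token is initialized as a copy of the original embedding for `" 1"`.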
config.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": "a959cf33c20e09215873e338299c900f57047c61",
3
+ "_name_or_path": "naver-clova-ix/donut-base",
4
+ "architectures": [
5
+ "VisionEncoderDecoderModel"
6
+ ],
7
+ "decoder": {
8
+ "_name_or_path": "",
9
+ "activation_dropout": 0.0,
10
+ "activation_function": "gelu",
11
+ "add_cross_attention": true,
12
+ "add_final_layer_norm": true,
13
+ "architectures": null,
14
+ "attention_dropout": 0.0,
15
+ "bad_words_ids": null,
16
+ "begin_suppress_tokens": null,
17
+ "bos_token_id": 0,
18
+ "chunk_size_feed_forward": 0,
19
+ "classifier_dropout": 0.0,
20
+ "cross_attention_hidden_size": null,
21
+ "d_model": 1024,
22
+ "decoder_attention_heads": 16,
23
+ "decoder_ffn_dim": 4096,
24
+ "decoder_layerdrop": 0.0,
25
+ "decoder_layers": 4,
26
+ "decoder_start_token_id": null,
27
+ "diversity_penalty": 0.0,
28
+ "do_sample": false,
29
+ "dropout": 0.1,
30
+ "early_stopping": false,
31
+ "encoder_attention_heads": 16,
32
+ "encoder_ffn_dim": 4096,
33
+ "encoder_layerdrop": 0.0,
34
+ "encoder_layers": 12,
35
+ "encoder_no_repeat_ngram_size": 0,
36
+ "eos_token_id": 2,
37
+ "exponential_decay_length_penalty": null,
38
+ "finetuning_task": null,
39
+ "forced_bos_token_id": null,
40
+ "forced_eos_token_id": 2,
41
+ "id2label": {
42
+ "0": "LABEL_0",
43
+ "1": "LABEL_1"
44
+ },
45
+ "init_std": 0.02,
46
+ "is_decoder": true,
47
+ "is_encoder_decoder": false,
48
+ "label2id": {
49
+ "LABEL_0": 0,
50
+ "LABEL_1": 1
51
+ },
52
+ "length_penalty": 1.0,
53
+ "max_length": 20,
54
+ "max_position_embeddings": 1536,
55
+ "min_length": 0,
56
+ "model_type": "mbart",
57
+ "no_repeat_ngram_size": 0,
58
+ "num_beam_groups": 1,
59
+ "num_beams": 1,
60
+ "num_hidden_layers": 12,
61
+ "num_return_sequences": 1,
62
+ "output_attentions": false,
63
+ "output_hidden_states": false,
64
+ "output_scores": false,
65
+ "pad_token_id": 1,
66
+ "prefix": null,
67
+ "problem_type": null,
68
+ "pruned_heads": {},
69
+ "remove_invalid_values": false,
70
+ "repetition_penalty": 1.0,
71
+ "return_dict": true,
72
+ "return_dict_in_generate": false,
73
+ "scale_embedding": true,
74
+ "sep_token_id": null,
75
+ "suppress_tokens": null,
76
+ "task_specific_params": null,
77
+ "temperature": 1.0,
78
+ "tf_legacy_loss": false,
79
+ "tie_encoder_decoder": false,
80
+ "tie_word_embeddings": true,
81
+ "tokenizer_class": null,
82
+ "top_k": 50,
83
+ "top_p": 1.0,
84
+ "torch_dtype": null,
85
+ "torchscript": false,
86
+ "transformers_version": "4.31.0",
87
+ "typical_p": 1.0,
88
+ "use_bfloat16": false,
89
+ "use_cache": true,
90
+ "vocab_size": 27513
91
+ },
92
+ "encoder": {
93
+ "_name_or_path": "",
94
+ "add_cross_attention": false,
95
+ "architectures": null,
96
+ "attention_probs_dropout_prob": 0.0,
97
+ "bad_words_ids": null,
98
+ "begin_suppress_tokens": null,
99
+ "bos_token_id": null,
100
+ "chunk_size_feed_forward": 0,
101
+ "cross_attention_hidden_size": null,
102
+ "decoder_start_token_id": null,
103
+ "depths": [
104
+ 2,
105
+ 2,
106
+ 14,
107
+ 2
108
+ ],
109
+ "diversity_penalty": 0.0,
110
+ "do_sample": false,
111
+ "drop_path_rate": 0.1,
112
+ "early_stopping": false,
113
+ "embed_dim": 128,
114
+ "encoder_no_repeat_ngram_size": 0,
115
+ "eos_token_id": null,
116
+ "exponential_decay_length_penalty": null,
117
+ "finetuning_task": null,
118
+ "forced_bos_token_id": null,
119
+ "forced_eos_token_id": null,
120
+ "hidden_act": "gelu",
121
+ "hidden_dropout_prob": 0.0,
122
+ "hidden_size": 1024,
123
+ "id2label": {
124
+ "0": "LABEL_0",
125
+ "1": "LABEL_1"
126
+ },
127
+ "image_size": [
128
+ 2560,
129
+ 1920
130
+ ],
131
+ "initializer_range": 0.02,
132
+ "is_decoder": false,
133
+ "is_encoder_decoder": false,
134
+ "label2id": {
135
+ "LABEL_0": 0,
136
+ "LABEL_1": 1
137
+ },
138
+ "layer_norm_eps": 1e-05,
139
+ "length_penalty": 1.0,
140
+ "max_length": 20,
141
+ "min_length": 0,
142
+ "mlp_ratio": 4.0,
143
+ "model_type": "donut-swin",
144
+ "no_repeat_ngram_size": 0,
145
+ "num_beam_groups": 1,
146
+ "num_beams": 1,
147
+ "num_channels": 3,
148
+ "num_heads": [
149
+ 4,
150
+ 8,
151
+ 16,
152
+ 32
153
+ ],
154
+ "num_layers": 4,
155
+ "num_return_sequences": 1,
156
+ "output_attentions": false,
157
+ "output_hidden_states": false,
158
+ "output_scores": false,
159
+ "pad_token_id": null,
160
+ "patch_size": 4,
161
+ "path_norm": true,
162
+ "prefix": null,
163
+ "problem_type": null,
164
+ "pruned_heads": {},
165
+ "qkv_bias": true,
166
+ "remove_invalid_values": false,
167
+ "repetition_penalty": 1.0,
168
+ "return_dict": true,
169
+ "return_dict_in_generate": false,
170
+ "sep_token_id": null,
171
+ "suppress_tokens": null,
172
+ "task_specific_params": null,
173
+ "temperature": 1.0,
174
+ "tf_legacy_loss": false,
175
+ "tie_encoder_decoder": false,
176
+ "tie_word_embeddings": true,
177
+ "tokenizer_class": null,
178
+ "top_k": 50,
179
+ "top_p": 1.0,
180
+ "torch_dtype": null,
181
+ "torchscript": false,
182
+ "transformers_version": "4.31.0",
183
+ "typical_p": 1.0,
184
+ "use_absolute_embeddings": false,
185
+ "use_bfloat16": false,
186
+ "window_size": 10
187
+ },
188
+ "is_encoder_decoder": true,
189
+ "model_type": "vision-encoder-decoder",
190
+ "tie_word_embeddings": false,
191
+ "torch_dtype": "float32",
192
+ "transformers_version": null
193
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "forced_eos_token_id": 2,
6
+ "pad_token_id": 1,
7
+ "transformers_version": "4.31.0"
8
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_align_long_axis": true,
3
+ "do_normalize": true,
4
+ "do_pad": true,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "do_thumbnail": true,
8
+ "image_mean": [
9
+ 0.5,
10
+ 0.5,
11
+ 0.5
12
+ ],
13
+ "image_processor_type": "DonutImageProcessor",
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "DonutProcessor",
20
+ "resample": 2,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "height": 2560,
24
+ "width": 1920
25
+ }
26
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcab57be6038f4f02e4b6ad305715c3ef6adb262353424e53e06666228242512
3
+ size 686243033
remove-donut-tokens.ipynb ADDED
@@ -0,0 +1,1314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "id": "IQxLmB8NW6pf"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "from transformers import AutoTokenizer\n",
12
+ "\n",
13
+ "model_name = \"naver-clova-ix/donut-base\"\n",
14
+ "\n",
15
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "57525\n"
28
+ ]
29
+ },
30
+ {
31
+ "data": {
32
+ "text/plain": [
33
+ "XLMRobertaTokenizerFast(name_or_path='naver-clova-ix/donut-base', vocab_size=57522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken(\"<mask>\", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['<s_iitcdip>', '<s_synthdog>']}, clean_up_tokenization_spaces=True)"
34
+ ]
35
+ },
36
+ "execution_count": 2,
37
+ "metadata": {},
38
+ "output_type": "execute_result"
39
+ }
40
+ ],
41
+ "source": [
42
+ "tokenizer.save_pretrained(\"old_tokenizer\")\n",
43
+ "\n",
44
+ "print(len(tokenizer))\n",
45
+ "tokenizer"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "markdown",
50
+ "metadata": {
51
+ "id": "Q8tn9ryurY2L"
52
+ },
53
+ "source": [
54
+ "# Modifying the sentencepiece file\n",
55
+ "\n",
56
+ "\n",
57
+ "Reference: https://blog.ceshine.net/post/trim-down-sentencepiece-vocabulary/"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 3,
63
+ "metadata": {
64
+ "colab": {
65
+ "base_uri": "https://localhost:8080/"
66
+ },
67
+ "id": "HDKf6E35pQ8F",
68
+ "outputId": "2f399f62-7796-463a-b0e1-59ec14357d2c"
69
+ },
70
+ "outputs": [
71
+ {
72
+ "data": {
73
+ "text/plain": [
74
+ "57520"
75
+ ]
76
+ },
77
+ "execution_count": 3,
78
+ "metadata": {},
79
+ "output_type": "execute_result"
80
+ }
81
+ ],
82
+ "source": [
83
+ "from transformers.convert_slow_tokenizer import import_protobuf\n",
84
+ "\n",
85
+ "model_pb2 = import_protobuf()\n",
86
+ "\n",
87
+ "m = model_pb2.ModelProto()\n",
88
+ "m.ParseFromString(open(\"./old_tokenizer/sentencepiece.bpe.model\", 'rb').read())\n",
89
+ "len(m.pieces)"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "metadata": {
95
+ "id": "elf0xBimspjR"
96
+ },
97
+ "source": [
98
+ "Because m.pieces is a Protocol Buffers field, we cannot merely point it to a new list. Instead, we need to use the field’s methods to manipulate its content:"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 4,
104
+ "metadata": {
105
+ "id": "oXfLQmYwsavB"
106
+ },
107
+ "outputs": [],
108
+ "source": [
109
+ "kept_pieces = []\n",
110
+ "\n",
111
+ "\n",
112
+ "for p in m.pieces:\n",
113
+ "\n",
114
+ " # WRITE YOUR OWN RULE FOR WHAT TOKENS TO KEEP\n",
115
+ " if p.piece.lstrip(\"▁\").isascii():\n",
116
+ " kept_pieces.append(p)"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 5,
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "i = 0\n",
126
+ "\n",
127
+ "kept_tokens = set([x.piece for x in kept_pieces])\n",
128
+ "\n",
129
+ "# go backwards from end\n",
130
+ "# until at start\n",
131
+ "while i < len(m.pieces):\n",
132
+ " \n",
133
+ " idx = len(m.pieces) - i - 1\n",
134
+ "\n",
135
+ " if m.pieces[idx].piece not in kept_tokens:\n",
136
+ " m.pieces.pop(idx)\n",
137
+ " else:\n",
138
+ " i += 1\n"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 6,
144
+ "metadata": {},
145
+ "outputs": [
146
+ {
147
+ "data": {
148
+ "text/plain": [
149
+ "27510"
150
+ ]
151
+ },
152
+ "execution_count": 6,
153
+ "metadata": {},
154
+ "output_type": "execute_result"
155
+ }
156
+ ],
157
+ "source": [
158
+ "len(m.pieces)"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "markdown",
163
+ "metadata": {},
164
+ "source": [
165
+ "# The Donut tokenizer doesn't have the \"1\" token\n",
166
+ "\n",
167
+ "It has tokens for \" 1\", \"10\", and \"1.1\", but no standalone \"1\" token, so a \"1\" that directly follows another character can be tokenized as the UNK token."
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 7,
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "3\n"
180
+ ]
181
+ },
182
+ {
183
+ "data": {
184
+ "text/plain": [
185
+ "[0, 56881, 3, 2]"
186
+ ]
187
+ },
188
+ "execution_count": 7,
189
+ "metadata": {},
190
+ "output_type": "execute_result"
191
+ }
192
+ ],
193
+ "source": [
194
+ "print(tokenizer.unk_token_id)\n",
195
+ "\n",
196
+ "# This results in the token turning into an unknown token (3)\n",
197
+ "tokenizer(\">1\").input_ids"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 8,
203
+ "metadata": {},
204
+ "outputs": [
205
+ {
206
+ "data": {
207
+ "text/plain": [
208
+ "[0, 39772, 3, 9447, 3, 54915, 3, 2]"
209
+ ]
210
+ },
211
+ "execution_count": 8,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "# Whenever a character is before the number 1, there is a decent chance the 1 will turn into UNK (id = 3)\n",
218
+ "tokenizer(\"10.1 )1 a1\").input_ids"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "markdown",
223
+ "metadata": {},
224
+ "source": [
225
+ "## Adding 1 into the sentencepiece model"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 9,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "from copy import deepcopy\n",
235
+ "\n",
236
+ "# copy the last piece\n",
237
+ "piece1 = deepcopy(m.pieces[-1])\n",
238
+ "\n",
239
+ "# modify the values of the following variables\n",
240
+ "piece1.piece = \"1\"\n",
241
+ "piece1.score = -10\n",
242
+ "\n",
243
+ "# include it in the model's list of pieces\n",
244
+ "m.pieces.extend([piece1])"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": 10,
250
+ "metadata": {
251
+ "id": "OrQk2mvZKWg-"
252
+ },
253
+ "outputs": [],
254
+ "source": [
255
+ "# create temporary sentencepiece file\n",
256
+ "\n",
257
+ "with open(\"temp_sentencepiece.bpe.model\", 'wb') as f:\n",
258
+ " f.write(m.SerializeToString())"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 11,
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "from transformers import XLMRobertaTokenizer\n",
268
+ "\n",
269
+ "new_tokenizer = XLMRobertaTokenizer(vocab_file=\"temp_sentencepiece.bpe.model\")"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 12,
275
+ "metadata": {},
276
+ "outputs": [
277
+ {
278
+ "data": {
279
+ "text/plain": [
280
+ "(27513, 57525)"
281
+ ]
282
+ },
283
+ "execution_count": 12,
284
+ "metadata": {},
285
+ "output_type": "execute_result"
286
+ }
287
+ ],
288
+ "source": [
289
+ "len(new_tokenizer), len(tokenizer)"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 13,
295
+ "metadata": {},
296
+ "outputs": [
297
+ {
298
+ "data": {
299
+ "text/plain": [
300
+ "('donut-base-ascii/tokenizer_config.json',\n",
301
+ " 'donut-base-ascii/special_tokens_map.json',\n",
302
+ " 'donut-base-ascii/sentencepiece.bpe.model',\n",
303
+ " 'donut-base-ascii/added_tokens.json')"
304
+ ]
305
+ },
306
+ "execution_count": 13,
307
+ "metadata": {},
308
+ "output_type": "execute_result"
309
+ }
310
+ ],
311
+ "source": [
312
+ "# the special tokens are in the model, but due to a quirk, they need to be added again\n",
313
+ "\n",
314
+ "new_tokenizer.add_special_tokens(new_tokenizer.special_tokens_map)\n",
315
+ "\n",
316
+ "new_tokenizer.save_pretrained('donut-base-ascii')"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 14,
322
+ "metadata": {},
323
+ "outputs": [
324
+ {
325
+ "data": {
326
+ "text/plain": [
327
+ "(27513, 57525)"
328
+ ]
329
+ },
330
+ "execution_count": 14,
331
+ "metadata": {},
332
+ "output_type": "execute_result"
333
+ }
334
+ ],
335
+ "source": [
336
+ "len(new_tokenizer), len(tokenizer)"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "execution_count": 15,
342
+ "metadata": {},
343
+ "outputs": [],
344
+ "source": [
345
+ "# reload to get all features\n",
346
+ "\n",
347
+ "new_tokenizer = AutoTokenizer.from_pretrained(\"donut-base-ascii\")"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 16,
353
+ "metadata": {},
354
+ "outputs": [],
355
+ "source": [
356
+ "old_mapping = tokenizer.vocab\n",
357
+ "\n",
358
+ "new_mapping = new_tokenizer.vocab\n",
359
+ "\n",
360
+ "sorted_new_mapping = sorted(new_mapping.items(), key=lambda x: x[1])# sort by id, ascending\n",
361
+ "\n",
362
+ "# `embed_indexes` will have the old index value stored at the new index\n",
363
+ "# e.g. embed_indexes[i] = j means the new embedding id at i has the same value\n",
364
+ "# as the old embedding id of j\n",
365
+ "embed_indexes = [old_mapping[tok] for tok, _ in sorted_new_mapping[:-2]]"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 17,
371
+ "metadata": {},
372
+ "outputs": [
373
+ {
374
+ "data": {
375
+ "text/plain": [
376
+ "[('1', 27511), ('<mask>', 27512)]"
377
+ ]
378
+ },
379
+ "execution_count": 17,
380
+ "metadata": {},
381
+ "output_type": "execute_result"
382
+ }
383
+ ],
384
+ "source": [
385
+ "# embed_indexes ignores the last two because the second to last one ('1')\n",
386
+ "# is brand new and has no embedding in the old model; '<mask>' comes after it.\n",
387
+ "\n",
388
+ "# these two embeddings will get added later\n",
389
+ "sorted_new_mapping[-2:]"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": 26,
395
+ "metadata": {},
396
+ "outputs": [
397
+ {
398
+ "name": "stdout",
399
+ "output_type": "stream",
400
+ "text": [
401
+ "torch.Size([27511, 1024])\n"
402
+ ]
403
+ }
404
+ ],
405
+ "source": [
406
+ "from transformers import VisionEncoderDecoderModel\n",
407
+ "\n",
408
+ "model_name = \"naver-clova-ix/donut-base\"\n",
409
+ "model = VisionEncoderDecoderModel.from_pretrained(model_name)\n",
410
+ "\n",
411
+ "old_embeds = model.decoder.model.decoder.embed_tokens.weight.data\n",
412
+ "old_embeds\n",
413
+ "\n",
414
+ "new_embeds = old_embeds[embed_indexes, :].clone()\n",
415
+ "\n",
416
+ "print(new_embeds.shape)"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "code",
421
+ "execution_count": 19,
422
+ "metadata": {},
423
+ "outputs": [
424
+ {
425
+ "name": "stdout",
426
+ "output_type": "stream",
427
+ "text": [
428
+ "torch.Size([1024])\n",
429
+ "torch.Size([1024])\n"
430
+ ]
431
+ },
432
+ {
433
+ "data": {
434
+ "text/plain": [
435
+ "torch.Size([27513, 1024])"
436
+ ]
437
+ },
438
+ "execution_count": 19,
439
+ "metadata": {},
440
+ "output_type": "execute_result"
441
+ }
442
+ ],
443
+ "source": [
444
+ "import torch\n",
445
+ "\n",
446
+ "# setting the embedding for the new token to be the same as \" 1\"\n",
447
+ "# during training, they will differentiate\n",
448
+ "embed_1 = old_embeds[old_mapping[\"▁1\"]].clone()\n",
449
+ "print(embed_1.shape)\n",
450
+ "\n",
451
+ "embed_mask = old_embeds[old_mapping[\"<mask>\"]].clone()\n",
452
+ "print(embed_mask.shape)\n",
453
+ "\n",
454
+ "new_embeds = torch.vstack([new_embeds, embed_1.unsqueeze(0), embed_mask.unsqueeze(0)])\n",
455
+ "\n",
456
+ "new_embeds.shape"
457
+ ]
458
+ },
459
+ {
460
+ "cell_type": "markdown",
461
+ "metadata": {},
462
+ "source": [
463
+ "## Put embeddings back into model"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "execution_count": 20,
469
+ "metadata": {},
470
+ "outputs": [],
471
+ "source": [
472
+ "model.decoder.model.decoder.embed_tokens.weight.data = new_embeds\n",
473
+ "\n",
474
+ "model.decoder.config.update({\n",
475
+ " \"vocab_size\": new_embeds.shape[0]\n",
476
+ "})\n",
477
+ "\n",
478
+ "model.save_pretrained(\"donut-base-ascii\")"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "markdown",
483
+ "metadata": {},
484
+ "source": [
485
+ "# Making sure the embeddings are correct"
486
+ ]
487
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": 21,
491
+ "metadata": {},
492
+ "outputs": [
493
+ {
494
+ "name": "stdout",
495
+ "output_type": "stream",
496
+ "text": [
497
+ "[0, 37199, 35816, 34554, 2]\n",
498
+ "[0, 14026, 13045, 12147, 2]\n"
499
+ ]
500
+ }
501
+ ],
502
+ "source": [
503
+ "old_ids = tokenizer(\"hello there\").input_ids\n",
504
+ "print(old_ids)\n",
505
+ "\n",
506
+ "new_ids = new_tokenizer(\"hello there\").input_ids\n",
507
+ "print(new_ids)"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": 22,
513
+ "metadata": {},
514
+ "outputs": [
515
+ {
516
+ "data": {
517
+ "text/plain": [
518
+ "tensor(True)"
519
+ ]
520
+ },
521
+ "execution_count": 22,
522
+ "metadata": {},
523
+ "output_type": "execute_result"
524
+ }
525
+ ],
526
+ "source": [
527
+ "import torch\n",
528
+ "\n",
529
+ "old_embeddings = torch.stack([old_embeds[i] for i in old_ids])\n",
530
+ "new_embeddings = torch.stack([new_embeds[i] for i in new_ids])\n",
531
+ "\n",
532
+ "torch.all(torch.eq(old_embeddings, new_embeddings))"
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "markdown",
537
+ "metadata": {},
538
+ "source": [
539
+ "## Add image processor so that all files are together"
540
+ ]
541
+ },
542
+ {
543
+ "cell_type": "code",
544
+ "execution_count": 27,
545
+ "metadata": {},
546
+ "outputs": [
547
+ {
548
+ "name": "stderr",
549
+ "output_type": "stream",
550
+ "text": [
551
+ "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.\n"
552
+ ]
553
+ },
554
+ {
555
+ "data": {
556
+ "text/plain": [
557
+ "['donut-base-ascii/preprocessor_config.json']"
558
+ ]
559
+ },
560
+ "execution_count": 27,
561
+ "metadata": {},
562
+ "output_type": "execute_result"
563
+ }
564
+ ],
565
+ "source": [
566
+ "from transformers import AutoImageProcessor\n",
567
+ "\n",
568
+ "proc = AutoImageProcessor.from_pretrained(model_name)\n",
569
+ "proc.save_pretrained(\"donut-base-ascii\")"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "markdown",
574
+ "metadata": {},
575
+ "source": [
576
+ "## Check that the new token for 1 works\n",
577
+ "\n",
578
+ "\n",
579
+ "unk_token_id = 3, so that shouldn't be present! Instead it should have 27511, the new token for \"1\""
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": 24,
585
+ "metadata": {},
586
+ "outputs": [
587
+ {
588
+ "data": {
589
+ "text/plain": [
590
+ "[0, 15793, 27511, 4056, 27511, 26020, 27511, 2]"
591
+ ]
592
+ },
593
+ "execution_count": 24,
594
+ "metadata": {},
595
+ "output_type": "execute_result"
596
+ }
597
+ ],
598
+ "source": [
599
+ "new_tokenizer(\"10.1 )1 a1\").input_ids"
600
+ ]
601
+ }
602
+ ],
603
+ "metadata": {
604
+ "colab": {
605
+ "provenance": []
606
+ },
607
+ "kernelspec": {
608
+ "display_name": "Python 3",
609
+ "name": "python3"
610
+ },
611
+ "language_info": {
612
+ "codemirror_mode": {
613
+ "name": "ipython",
614
+ "version": 3
615
+ },
616
+ "file_extension": ".py",
617
+ "mimetype": "text/x-python",
618
+ "name": "python",
619
+ "nbconvert_exporter": "python",
620
+ "pygments_lexer": "ipython3",
621
+ "version": "3.10.10"
622
+ },
623
+ "widgets": {
624
+ "application/vnd.jupyter.widget-state+json": {
625
+ "0199dce34d0b4101ab2da9cd761f17ea": {
626
+ "model_module": "@jupyter-widgets/controls",
627
+ "model_module_version": "1.5.0",
628
+ "model_name": "HTMLModel",
629
+ "state": {
630
+ "_dom_classes": [],
631
+ "_model_module": "@jupyter-widgets/controls",
632
+ "_model_module_version": "1.5.0",
633
+ "_model_name": "HTMLModel",
634
+ "_view_count": null,
635
+ "_view_module": "@jupyter-widgets/controls",
636
+ "_view_module_version": "1.5.0",
637
+ "_view_name": "HTMLView",
638
+ "description": "",
639
+ "description_tooltip": null,
640
+ "layout": "IPY_MODEL_9ac895ad7a3d4af4b75076fc1e8433ac",
641
+ "placeholder": "​",
642
+ "style": "IPY_MODEL_f97ecaf1a1af41029bb2d79334e83b3d",
643
+ "value": " 4.74k/4.74k [00:00&lt;00:00, 230kB/s]"
644
+ }
645
+ },
646
+ "01d989190ff34c499ef4eb023a982a13": {
647
+ "model_module": "@jupyter-widgets/controls",
648
+ "model_module_version": "1.5.0",
649
+ "model_name": "DescriptionStyleModel",
650
+ "state": {
651
+ "_model_module": "@jupyter-widgets/controls",
652
+ "_model_module_version": "1.5.0",
653
+ "_model_name": "DescriptionStyleModel",
654
+ "_view_count": null,
655
+ "_view_module": "@jupyter-widgets/base",
656
+ "_view_module_version": "1.2.0",
657
+ "_view_name": "StyleView",
658
+ "description_width": ""
659
+ }
660
+ },
661
+ "3dcafb3def654095a3a02644eb1b79b6": {
662
+ "model_module": "@jupyter-widgets/controls",
663
+ "model_module_version": "1.5.0",
664
+ "model_name": "ProgressStyleModel",
665
+ "state": {
666
+ "_model_module": "@jupyter-widgets/controls",
667
+ "_model_module_version": "1.5.0",
668
+ "_model_name": "ProgressStyleModel",
669
+ "_view_count": null,
670
+ "_view_module": "@jupyter-widgets/base",
671
+ "_view_module_version": "1.2.0",
672
+ "_view_name": "StyleView",
673
+ "bar_color": null,
674
+ "description_width": ""
675
+ }
676
+ },
677
+ "493c720b2da54790a6b8e3ec0ee44f8d": {
678
+ "model_module": "@jupyter-widgets/base",
679
+ "model_module_version": "1.2.0",
680
+ "model_name": "LayoutModel",
681
+ "state": {
682
+ "_model_module": "@jupyter-widgets/base",
683
+ "_model_module_version": "1.2.0",
684
+ "_model_name": "LayoutModel",
685
+ "_view_count": null,
686
+ "_view_module": "@jupyter-widgets/base",
687
+ "_view_module_version": "1.2.0",
688
+ "_view_name": "LayoutView",
689
+ "align_content": null,
690
+ "align_items": null,
691
+ "align_self": null,
692
+ "border": null,
693
+ "bottom": null,
694
+ "display": null,
695
+ "flex": null,
696
+ "flex_flow": null,
697
+ "grid_area": null,
698
+ "grid_auto_columns": null,
699
+ "grid_auto_flow": null,
700
+ "grid_auto_rows": null,
701
+ "grid_column": null,
702
+ "grid_gap": null,
703
+ "grid_row": null,
704
+ "grid_template_areas": null,
705
+ "grid_template_columns": null,
706
+ "grid_template_rows": null,
707
+ "height": null,
708
+ "justify_content": null,
709
+ "justify_items": null,
710
+ "left": null,
711
+ "margin": null,
712
+ "max_height": null,
713
+ "max_width": null,
714
+ "min_height": null,
715
+ "min_width": null,
716
+ "object_fit": null,
717
+ "object_position": null,
718
+ "order": null,
719
+ "overflow": null,
720
+ "overflow_x": null,
721
+ "overflow_y": null,
722
+ "padding": null,
723
+ "right": null,
724
+ "top": null,
725
+ "visibility": null,
726
+ "width": null
727
+ }
728
+ },
729
+ "4aa596c990844b06b1081f373235cbe9": {
730
+ "model_module": "@jupyter-widgets/controls",
731
+ "model_module_version": "1.5.0",
732
+ "model_name": "FloatProgressModel",
733
+ "state": {
734
+ "_dom_classes": [],
735
+ "_model_module": "@jupyter-widgets/controls",
736
+ "_model_module_version": "1.5.0",
737
+ "_model_name": "FloatProgressModel",
738
+ "_view_count": null,
739
+ "_view_module": "@jupyter-widgets/controls",
740
+ "_view_module_version": "1.5.0",
741
+ "_view_name": "ProgressView",
742
+ "bar_style": "success",
743
+ "description": "",
744
+ "description_tooltip": null,
745
+ "layout": "IPY_MODEL_493c720b2da54790a6b8e3ec0ee44f8d",
746
+ "max": 4742,
747
+ "min": 0,
748
+ "orientation": "horizontal",
749
+ "style": "IPY_MODEL_3dcafb3def654095a3a02644eb1b79b6",
750
+ "value": 4742
751
+ }
752
+ },
753
+ "66b76b6edce045b480ebed513ba1ab6e": {
754
+ "model_module": "@jupyter-widgets/base",
755
+ "model_module_version": "1.2.0",
756
+ "model_name": "LayoutModel",
757
+ "state": {
758
+ "_model_module": "@jupyter-widgets/base",
759
+ "_model_module_version": "1.2.0",
760
+ "_model_name": "LayoutModel",
761
+ "_view_count": null,
762
+ "_view_module": "@jupyter-widgets/base",
763
+ "_view_module_version": "1.2.0",
764
+ "_view_name": "LayoutView",
765
+ "align_content": null,
766
+ "align_items": null,
767
+ "align_self": null,
768
+ "border": null,
769
+ "bottom": null,
770
+ "display": null,
771
+ "flex": null,
772
+ "flex_flow": null,
773
+ "grid_area": null,
774
+ "grid_auto_columns": null,
775
+ "grid_auto_flow": null,
776
+ "grid_auto_rows": null,
777
+ "grid_column": null,
778
+ "grid_gap": null,
779
+ "grid_row": null,
780
+ "grid_template_areas": null,
781
+ "grid_template_columns": null,
782
+ "grid_template_rows": null,
783
+ "height": null,
784
+ "justify_content": null,
785
+ "justify_items": null,
786
+ "left": null,
787
+ "margin": null,
788
+ "max_height": null,
789
+ "max_width": null,
790
+ "min_height": null,
791
+ "min_width": null,
792
+ "object_fit": null,
793
+ "object_position": null,
794
+ "order": null,
795
+ "overflow": null,
796
+ "overflow_x": null,
797
+ "overflow_y": null,
798
+ "padding": null,
799
+ "right": null,
800
+ "top": null,
801
+ "visibility": null,
802
+ "width": null
803
+ }
804
+ },
805
+ "6b432819bf504227a04a10a749a848e9": {
806
+ "model_module": "@jupyter-widgets/base",
807
+ "model_module_version": "1.2.0",
808
+ "model_name": "LayoutModel",
809
+ "state": {
810
+ "_model_module": "@jupyter-widgets/base",
811
+ "_model_module_version": "1.2.0",
812
+ "_model_name": "LayoutModel",
813
+ "_view_count": null,
814
+ "_view_module": "@jupyter-widgets/base",
815
+ "_view_module_version": "1.2.0",
816
+ "_view_name": "LayoutView",
817
+ "align_content": null,
818
+ "align_items": null,
819
+ "align_self": null,
820
+ "border": null,
821
+ "bottom": null,
822
+ "display": null,
823
+ "flex": null,
824
+ "flex_flow": null,
825
+ "grid_area": null,
826
+ "grid_auto_columns": null,
827
+ "grid_auto_flow": null,
828
+ "grid_auto_rows": null,
829
+ "grid_column": null,
830
+ "grid_gap": null,
831
+ "grid_row": null,
832
+ "grid_template_areas": null,
833
+ "grid_template_columns": null,
834
+ "grid_template_rows": null,
835
+ "height": null,
836
+ "justify_content": null,
837
+ "justify_items": null,
838
+ "left": null,
839
+ "margin": null,
840
+ "max_height": null,
841
+ "max_width": null,
842
+ "min_height": null,
843
+ "min_width": null,
844
+ "object_fit": null,
845
+ "object_position": null,
846
+ "order": null,
847
+ "overflow": null,
848
+ "overflow_x": null,
849
+ "overflow_y": null,
850
+ "padding": null,
851
+ "right": null,
852
+ "top": null,
853
+ "visibility": null,
854
+ "width": null
855
+ }
856
+ },
857
+ "6e87b5a1db834af09c4507881ce12fd8": {
858
+ "model_module": "@jupyter-widgets/controls",
859
+ "model_module_version": "1.5.0",
860
+ "model_name": "DescriptionStyleModel",
861
+ "state": {
862
+ "_model_module": "@jupyter-widgets/controls",
863
+ "_model_module_version": "1.5.0",
864
+ "_model_name": "DescriptionStyleModel",
865
+ "_view_count": null,
866
+ "_view_module": "@jupyter-widgets/base",
867
+ "_view_module_version": "1.2.0",
868
+ "_view_name": "StyleView",
869
+ "description_width": ""
870
+ }
871
+ },
872
+ "70ec1c59c5c34b448d91ad895137b7c0": {
873
+ "model_module": "@jupyter-widgets/controls",
874
+ "model_module_version": "1.5.0",
875
+ "model_name": "ProgressStyleModel",
876
+ "state": {
877
+ "_model_module": "@jupyter-widgets/controls",
878
+ "_model_module_version": "1.5.0",
879
+ "_model_name": "ProgressStyleModel",
880
+ "_view_count": null,
881
+ "_view_module": "@jupyter-widgets/base",
882
+ "_view_module_version": "1.2.0",
883
+ "_view_name": "StyleView",
884
+ "bar_color": null,
885
+ "description_width": ""
886
+ }
887
+ },
888
+ "77506fb9c6404b74a5f8d82fa323a275": {
889
+ "model_module": "@jupyter-widgets/controls",
890
+ "model_module_version": "1.5.0",
891
+ "model_name": "HTMLModel",
892
+ "state": {
893
+ "_dom_classes": [],
894
+ "_model_module": "@jupyter-widgets/controls",
895
+ "_model_module_version": "1.5.0",
896
+ "_model_name": "HTMLModel",
897
+ "_view_count": null,
898
+ "_view_module": "@jupyter-widgets/controls",
899
+ "_view_module_version": "1.5.0",
900
+ "_view_name": "HTMLView",
901
+ "description": "",
902
+ "description_tooltip": null,
903
+ "layout": "IPY_MODEL_a9f316334ef9440a979c332a0ae8e7cd",
904
+ "placeholder": "​",
905
+ "style": "IPY_MODEL_01d989190ff34c499ef4eb023a982a13",
906
+ "value": "Downloading pytorch_model.bin: 100%"
907
+ }
908
+ },
909
+ "78760960021e43ce85e63f08c55b821d": {
910
+ "model_module": "@jupyter-widgets/controls",
911
+ "model_module_version": "1.5.0",
912
+ "model_name": "HBoxModel",
913
+ "state": {
914
+ "_dom_classes": [],
915
+ "_model_module": "@jupyter-widgets/controls",
916
+ "_model_module_version": "1.5.0",
917
+ "_model_name": "HBoxModel",
918
+ "_view_count": null,
919
+ "_view_module": "@jupyter-widgets/controls",
920
+ "_view_module_version": "1.5.0",
921
+ "_view_name": "HBoxView",
922
+ "box_style": "",
923
+ "children": [
924
+ "IPY_MODEL_8dd9d54bf1a3499c9d075d7dd34e8f3a",
925
+ "IPY_MODEL_4aa596c990844b06b1081f373235cbe9",
926
+ "IPY_MODEL_0199dce34d0b4101ab2da9cd761f17ea"
927
+ ],
928
+ "layout": "IPY_MODEL_66b76b6edce045b480ebed513ba1ab6e"
929
+ }
930
+ },
931
+ "80ce735e6b314dcb92fab111b26a43d6": {
932
+ "model_module": "@jupyter-widgets/base",
933
+ "model_module_version": "1.2.0",
934
+ "model_name": "LayoutModel",
935
+ "state": {
936
+ "_model_module": "@jupyter-widgets/base",
937
+ "_model_module_version": "1.2.0",
938
+ "_model_name": "LayoutModel",
939
+ "_view_count": null,
940
+ "_view_module": "@jupyter-widgets/base",
941
+ "_view_module_version": "1.2.0",
942
+ "_view_name": "LayoutView",
943
+ "align_content": null,
944
+ "align_items": null,
945
+ "align_self": null,
946
+ "border": null,
947
+ "bottom": null,
948
+ "display": null,
949
+ "flex": null,
950
+ "flex_flow": null,
951
+ "grid_area": null,
952
+ "grid_auto_columns": null,
953
+ "grid_auto_flow": null,
954
+ "grid_auto_rows": null,
955
+ "grid_column": null,
956
+ "grid_gap": null,
957
+ "grid_row": null,
958
+ "grid_template_areas": null,
959
+ "grid_template_columns": null,
960
+ "grid_template_rows": null,
961
+ "height": null,
962
+ "justify_content": null,
963
+ "justify_items": null,
964
+ "left": null,
965
+ "margin": null,
966
+ "max_height": null,
967
+ "max_width": null,
968
+ "min_height": null,
969
+ "min_width": null,
970
+ "object_fit": null,
971
+ "object_position": null,
972
+ "order": null,
973
+ "overflow": null,
974
+ "overflow_x": null,
975
+ "overflow_y": null,
976
+ "padding": null,
977
+ "right": null,
978
+ "top": null,
979
+ "visibility": null,
980
+ "width": null
981
+ }
982
+ },
983
+ "824c138272c94f4086f9035f97b082c3": {
984
+ "model_module": "@jupyter-widgets/controls",
985
+ "model_module_version": "1.5.0",
986
+ "model_name": "FloatProgressModel",
987
+ "state": {
988
+ "_dom_classes": [],
989
+ "_model_module": "@jupyter-widgets/controls",
990
+ "_model_module_version": "1.5.0",
991
+ "_model_name": "FloatProgressModel",
992
+ "_view_count": null,
993
+ "_view_module": "@jupyter-widgets/controls",
994
+ "_view_module_version": "1.5.0",
995
+ "_view_name": "ProgressView",
996
+ "bar_style": "success",
997
+ "description": "",
998
+ "description_tooltip": null,
999
+ "layout": "IPY_MODEL_cd5a87bcb60a44e194b3db834f200061",
1000
+ "max": 809168699,
1001
+ "min": 0,
1002
+ "orientation": "horizontal",
1003
+ "style": "IPY_MODEL_70ec1c59c5c34b448d91ad895137b7c0",
1004
+ "value": 809168699
1005
+ }
1006
+ },
1007
+ "8dd9d54bf1a3499c9d075d7dd34e8f3a": {
1008
+ "model_module": "@jupyter-widgets/controls",
1009
+ "model_module_version": "1.5.0",
1010
+ "model_name": "HTMLModel",
1011
+ "state": {
1012
+ "_dom_classes": [],
1013
+ "_model_module": "@jupyter-widgets/controls",
1014
+ "_model_module_version": "1.5.0",
1015
+ "_model_name": "HTMLModel",
1016
+ "_view_count": null,
1017
+ "_view_module": "@jupyter-widgets/controls",
1018
+ "_view_module_version": "1.5.0",
1019
+ "_view_name": "HTMLView",
1020
+ "description": "",
1021
+ "description_tooltip": null,
1022
+ "layout": "IPY_MODEL_f98d3c18e9454ed2be3197330e5d84b1",
1023
+ "placeholder": "​",
1024
+ "style": "IPY_MODEL_6e87b5a1db834af09c4507881ce12fd8",
1025
+ "value": "Downloading (…)lve/main/config.json: 100%"
1026
+ }
1027
+ },
1028
+ "9ac895ad7a3d4af4b75076fc1e8433ac": {
1029
+ "model_module": "@jupyter-widgets/base",
1030
+ "model_module_version": "1.2.0",
1031
+ "model_name": "LayoutModel",
1032
+ "state": {
1033
+ "_model_module": "@jupyter-widgets/base",
1034
+ "_model_module_version": "1.2.0",
1035
+ "_model_name": "LayoutModel",
1036
+ "_view_count": null,
1037
+ "_view_module": "@jupyter-widgets/base",
1038
+ "_view_module_version": "1.2.0",
1039
+ "_view_name": "LayoutView",
1040
+ "align_content": null,
1041
+ "align_items": null,
1042
+ "align_self": null,
1043
+ "border": null,
1044
+ "bottom": null,
1045
+ "display": null,
1046
+ "flex": null,
1047
+ "flex_flow": null,
1048
+ "grid_area": null,
1049
+ "grid_auto_columns": null,
1050
+ "grid_auto_flow": null,
1051
+ "grid_auto_rows": null,
1052
+ "grid_column": null,
1053
+ "grid_gap": null,
1054
+ "grid_row": null,
1055
+ "grid_template_areas": null,
1056
+ "grid_template_columns": null,
1057
+ "grid_template_rows": null,
1058
+ "height": null,
1059
+ "justify_content": null,
1060
+ "justify_items": null,
1061
+ "left": null,
1062
+ "margin": null,
1063
+ "max_height": null,
1064
+ "max_width": null,
1065
+ "min_height": null,
1066
+ "min_width": null,
1067
+ "object_fit": null,
1068
+ "object_position": null,
1069
+ "order": null,
1070
+ "overflow": null,
1071
+ "overflow_x": null,
1072
+ "overflow_y": null,
1073
+ "padding": null,
1074
+ "right": null,
1075
+ "top": null,
1076
+ "visibility": null,
1077
+ "width": null
1078
+ }
1079
+ },
1080
+ "a9f316334ef9440a979c332a0ae8e7cd": {
1081
+ "model_module": "@jupyter-widgets/base",
1082
+ "model_module_version": "1.2.0",
1083
+ "model_name": "LayoutModel",
1084
+ "state": {
1085
+ "_model_module": "@jupyter-widgets/base",
1086
+ "_model_module_version": "1.2.0",
1087
+ "_model_name": "LayoutModel",
1088
+ "_view_count": null,
1089
+ "_view_module": "@jupyter-widgets/base",
1090
+ "_view_module_version": "1.2.0",
1091
+ "_view_name": "LayoutView",
1092
+ "align_content": null,
1093
+ "align_items": null,
1094
+ "align_self": null,
1095
+ "border": null,
1096
+ "bottom": null,
1097
+ "display": null,
1098
+ "flex": null,
1099
+ "flex_flow": null,
1100
+ "grid_area": null,
1101
+ "grid_auto_columns": null,
1102
+ "grid_auto_flow": null,
1103
+ "grid_auto_rows": null,
1104
+ "grid_column": null,
1105
+ "grid_gap": null,
1106
+ "grid_row": null,
1107
+ "grid_template_areas": null,
1108
+ "grid_template_columns": null,
1109
+ "grid_template_rows": null,
1110
+ "height": null,
1111
+ "justify_content": null,
1112
+ "justify_items": null,
1113
+ "left": null,
1114
+ "margin": null,
1115
+ "max_height": null,
1116
+ "max_width": null,
1117
+ "min_height": null,
1118
+ "min_width": null,
1119
+ "object_fit": null,
1120
+ "object_position": null,
1121
+ "order": null,
1122
+ "overflow": null,
1123
+ "overflow_x": null,
1124
+ "overflow_y": null,
1125
+ "padding": null,
1126
+ "right": null,
1127
+ "top": null,
1128
+ "visibility": null,
1129
+ "width": null
1130
+ }
1131
+ },
1132
+ "c35a984617774c8fbde92917bcae872e": {
1133
+ "model_module": "@jupyter-widgets/controls",
1134
+ "model_module_version": "1.5.0",
1135
+ "model_name": "HBoxModel",
1136
+ "state": {
1137
+ "_dom_classes": [],
1138
+ "_model_module": "@jupyter-widgets/controls",
1139
+ "_model_module_version": "1.5.0",
1140
+ "_model_name": "HBoxModel",
1141
+ "_view_count": null,
1142
+ "_view_module": "@jupyter-widgets/controls",
1143
+ "_view_module_version": "1.5.0",
1144
+ "_view_name": "HBoxView",
1145
+ "box_style": "",
1146
+ "children": [
1147
+ "IPY_MODEL_77506fb9c6404b74a5f8d82fa323a275",
1148
+ "IPY_MODEL_824c138272c94f4086f9035f97b082c3",
1149
+ "IPY_MODEL_e0dedfb1d27d4b1aa4c090477d985257"
1150
+ ],
1151
+ "layout": "IPY_MODEL_6b432819bf504227a04a10a749a848e9"
1152
+ }
1153
+ },
1154
+ "c53794a14c3049d193260cffca0a6aaa": {
1155
+ "model_module": "@jupyter-widgets/controls",
1156
+ "model_module_version": "1.5.0",
1157
+ "model_name": "DescriptionStyleModel",
1158
+ "state": {
1159
+ "_model_module": "@jupyter-widgets/controls",
1160
+ "_model_module_version": "1.5.0",
1161
+ "_model_name": "DescriptionStyleModel",
1162
+ "_view_count": null,
1163
+ "_view_module": "@jupyter-widgets/base",
1164
+ "_view_module_version": "1.2.0",
1165
+ "_view_name": "StyleView",
1166
+ "description_width": ""
1167
+ }
1168
+ },
1169
+ "cd5a87bcb60a44e194b3db834f200061": {
1170
+ "model_module": "@jupyter-widgets/base",
1171
+ "model_module_version": "1.2.0",
1172
+ "model_name": "LayoutModel",
1173
+ "state": {
1174
+ "_model_module": "@jupyter-widgets/base",
1175
+ "_model_module_version": "1.2.0",
1176
+ "_model_name": "LayoutModel",
1177
+ "_view_count": null,
1178
+ "_view_module": "@jupyter-widgets/base",
1179
+ "_view_module_version": "1.2.0",
1180
+ "_view_name": "LayoutView",
1181
+ "align_content": null,
1182
+ "align_items": null,
1183
+ "align_self": null,
1184
+ "border": null,
1185
+ "bottom": null,
1186
+ "display": null,
1187
+ "flex": null,
1188
+ "flex_flow": null,
1189
+ "grid_area": null,
1190
+ "grid_auto_columns": null,
1191
+ "grid_auto_flow": null,
1192
+ "grid_auto_rows": null,
1193
+ "grid_column": null,
1194
+ "grid_gap": null,
1195
+ "grid_row": null,
1196
+ "grid_template_areas": null,
1197
+ "grid_template_columns": null,
1198
+ "grid_template_rows": null,
1199
+ "height": null,
1200
+ "justify_content": null,
1201
+ "justify_items": null,
1202
+ "left": null,
1203
+ "margin": null,
1204
+ "max_height": null,
1205
+ "max_width": null,
1206
+ "min_height": null,
1207
+ "min_width": null,
1208
+ "object_fit": null,
1209
+ "object_position": null,
1210
+ "order": null,
1211
+ "overflow": null,
1212
+ "overflow_x": null,
1213
+ "overflow_y": null,
1214
+ "padding": null,
1215
+ "right": null,
1216
+ "top": null,
1217
+ "visibility": null,
1218
+ "width": null
1219
+ }
1220
+ },
1221
+ "e0dedfb1d27d4b1aa4c090477d985257": {
1222
+ "model_module": "@jupyter-widgets/controls",
1223
+ "model_module_version": "1.5.0",
1224
+ "model_name": "HTMLModel",
1225
+ "state": {
1226
+ "_dom_classes": [],
1227
+ "_model_module": "@jupyter-widgets/controls",
1228
+ "_model_module_version": "1.5.0",
1229
+ "_model_name": "HTMLModel",
1230
+ "_view_count": null,
1231
+ "_view_module": "@jupyter-widgets/controls",
1232
+ "_view_module_version": "1.5.0",
1233
+ "_view_name": "HTMLView",
1234
+ "description": "",
1235
+ "description_tooltip": null,
1236
+ "layout": "IPY_MODEL_80ce735e6b314dcb92fab111b26a43d6",
1237
+ "placeholder": "​",
1238
+ "style": "IPY_MODEL_c53794a14c3049d193260cffca0a6aaa",
1239
+ "value": " 809M/809M [00:07&lt;00:00, 116MB/s]"
1240
+ }
1241
+ },
1242
+ "f97ecaf1a1af41029bb2d79334e83b3d": {
1243
+ "model_module": "@jupyter-widgets/controls",
1244
+ "model_module_version": "1.5.0",
1245
+ "model_name": "DescriptionStyleModel",
1246
+ "state": {
1247
+ "_model_module": "@jupyter-widgets/controls",
1248
+ "_model_module_version": "1.5.0",
1249
+ "_model_name": "DescriptionStyleModel",
1250
+ "_view_count": null,
1251
+ "_view_module": "@jupyter-widgets/base",
1252
+ "_view_module_version": "1.2.0",
1253
+ "_view_name": "StyleView",
1254
+ "description_width": ""
1255
+ }
1256
+ },
1257
+ "f98d3c18e9454ed2be3197330e5d84b1": {
1258
+ "model_module": "@jupyter-widgets/base",
1259
+ "model_module_version": "1.2.0",
1260
+ "model_name": "LayoutModel",
1261
+ "state": {
1262
+ "_model_module": "@jupyter-widgets/base",
1263
+ "_model_module_version": "1.2.0",
1264
+ "_model_name": "LayoutModel",
1265
+ "_view_count": null,
1266
+ "_view_module": "@jupyter-widgets/base",
1267
+ "_view_module_version": "1.2.0",
1268
+ "_view_name": "LayoutView",
1269
+ "align_content": null,
1270
+ "align_items": null,
1271
+ "align_self": null,
1272
+ "border": null,
1273
+ "bottom": null,
1274
+ "display": null,
1275
+ "flex": null,
1276
+ "flex_flow": null,
1277
+ "grid_area": null,
1278
+ "grid_auto_columns": null,
1279
+ "grid_auto_flow": null,
1280
+ "grid_auto_rows": null,
1281
+ "grid_column": null,
1282
+ "grid_gap": null,
1283
+ "grid_row": null,
1284
+ "grid_template_areas": null,
1285
+ "grid_template_columns": null,
1286
+ "grid_template_rows": null,
1287
+ "height": null,
1288
+ "justify_content": null,
1289
+ "justify_items": null,
1290
+ "left": null,
1291
+ "margin": null,
1292
+ "max_height": null,
1293
+ "max_width": null,
1294
+ "min_height": null,
1295
+ "min_width": null,
1296
+ "object_fit": null,
1297
+ "object_position": null,
1298
+ "order": null,
1299
+ "overflow": null,
1300
+ "overflow_x": null,
1301
+ "overflow_y": null,
1302
+ "padding": null,
1303
+ "right": null,
1304
+ "top": null,
1305
+ "visibility": null,
1306
+ "width": null
1307
+ }
1308
+ }
1309
+ }
1310
+ }
1311
+ },
1312
+ "nbformat": 4,
1313
+ "nbformat_minor": 0
1314
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2a67edafddbb4e6a16de0e91b4b87c76cddf14b88f5db874ba628ca6813717
3
+ size 719223
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": "<mask>",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "</s>",
8
+ "unk_token": "<unk>"
9
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": {
7
+ "__type": "AddedToken",
8
+ "content": "<mask>",
9
+ "lstrip": true,
10
+ "normalized": true,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "model_max_length": 1000000000000000019884624838656,
15
+ "pad_token": "<pad>",
16
+ "sep_token": "</s>",
17
+ "sp_model_kwargs": {},
18
+ "tokenizer_class": "XLMRobertaTokenizer",
19
+ "unk_token": "<unk>"
20
+ }