Vu Minh Chien committed on
Commit
5fe9736
·
1 Parent(s): 3c6b4f9

Add model files

Browse files
Fine-Tune-Wav2Vec2-Large-XLSR-Japan.ipynb ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "language_info": {
4
+ "codemirror_mode": {
5
+ "name": "ipython",
6
+ "version": 3
7
+ },
8
+ "file_extension": ".py",
9
+ "mimetype": "text/x-python",
10
+ "name": "python",
11
+ "nbconvert_exporter": "python",
12
+ "pygments_lexer": "ipython3",
13
+ "version": 3
14
+ },
15
+ "orig_nbformat": 2
16
+ },
17
+ "nbformat": 4,
18
+ "nbformat_minor": 2,
19
+ "cells": [
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "%%capture\n",
27
+ "!pip install datasets==1.4.1\n",
28
+ "!pip install transformers==4.4.0\n",
29
+ "!pip install torchaudio\n",
30
+ "!pip install librosa\n",
31
+ "!pip install jiwer\n",
32
+ "!pip install mecab-python3\n",
33
+ "!pip install unidic-lite\n",
34
+ "!pip install audiomentations"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor\n",
44
+ "from datasets import load_dataset, load_metric, ClassLabel, Dataset\n",
45
+ "from audiomentations import Compose, AddGaussianNoise, Gain, PitchShift, TimeStretch, Shift\n",
46
+ "from torch.optim.lr_scheduler import LambdaLR\n",
47
+ "from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer\n",
48
+ "\n",
49
+ "import pandas as pd\n",
50
+ "import numpy as np\n",
51
+ "import soundfile as sf\n",
52
+ "import re\n",
53
+ "import json\n",
54
+ "import torchaudio\n",
55
+ "import librosa\n",
56
+ "import datasets\n",
57
+ "import MeCab\n",
58
+ "import pykakasi\n",
59
+ "import random\n",
60
+ "\n",
61
+ "import torch\n",
62
+ "from dataclasses import dataclass, field\n",
63
+ "from typing import Any, Dict, List, Optional, Union"
64
+ ]
65
+ },
66
+ {
67
+ "source": [
68
+ "# Load dataset and prepare processor"
69
+ ],
70
+ "cell_type": "markdown",
71
+ "metadata": {}
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": null,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "# Load public dataset from University of Tokyo\n",
80
+ "!wget http://ss-takashi.sakura.ne.jp/corpus/jsut_ver1.1.zip\n",
81
+ "!unzip jsut_ver1.1.zip\n",
82
+ "\n",
83
+ "path = 'jsut_ver1.1/basic5000/'\n",
84
+ "df = pd.read_csv(path + 'transcript_utf8.txt', header = None, delimiter = \":\", names=[\"path\", \"sentence\"], index_col=False)\n",
85
+ "df[\"path\"] = df[\"path\"].map(lambda x: path + 'wav/' + x + \".wav\")\n",
86
+ "df.head()\n",
87
+ "\n",
88
+ "jsut_voice_train = Dataset.from_pandas(df)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "# Import training dataset\n",
98
+ "common_voice_train = load_dataset('common_voice', 'ja',split='train+validation')\n",
99
+ "common_voice_test = load_dataset('common_voice', 'ja', split='test')\n",
100
+ "\n",
101
+ "# Remove unwanted columns\n",
102
+ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
103
+ "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
104
+ "\n",
105
+ "# Concat common voice and public dataset\n",
106
+ "common_voice_train = datasets.concatenate_datasets([jsut_voice_train, common_voice_train])"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "# Parse a Japanese sentence into space-separated words. Ex: \"pythonが大好きです\" -> \"python が 大好き です EOS\"\n",
116
+ "wakati = MeCab.Tagger(\"-Owakati\")\n",
117
+ "\n",
118
+ "# Unwanted token\n",
119
+ "chars_to_ignore_regex = '[\\,\\、\\。\\.\\「\\」\\…\\?\\・]'\n",
120
+ "\n",
121
+ "def remove_special_characters(batch):\n",
122
+ " batch[\"sentence\"] = wakati.parse(batch[\"sentence\"]).strip()\n",
123
+ " batch[\"sentence\"] = re.sub(chars_to_ignore_regex,'', batch[\"sentence\"]).strip()\n",
124
+ " return batch\n",
125
+ "\n",
126
+ "common_voice_train = common_voice_train.map(remove_special_characters)\n",
127
+ "common_voice_test = common_voice_test.map(remove_special_characters)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "# make vocab file\n",
137
+ "def extract_all_chars(batch):\n",
138
+ " all_text = \" \".join(batch[\"sentence\"])\n",
139
+ " vocab = list(set(all_text))\n",
140
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
141
+ "\n",
142
+ "# make vocab list and text\n",
143
+ "vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
144
+ "vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)\n",
145
+ "\n",
146
+ "# concatenate vocab from train and test sets\n",
147
+ "vocab_list = list(set(vocab_train[\"vocab\"][0]) | set(vocab_test[\"vocab\"][0]))\n",
148
+ "vocab_dict = {v: k for k, v in enumerate(vocab_list)}\n",
149
+ "print(len(vocab_dict))\n",
150
+ "vocab_dict[\"|\"] = vocab_dict[\" \"]\n",
151
+ "del vocab_dict[\" \"]\n",
152
+ "\n",
153
+ "# create unk and pad token\n",
154
+ "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
155
+ "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
156
+ "\n",
157
+ "# save to json file\n",
158
+ "with open('vocab.json', 'w') as vocab_file:\n",
159
+ " json.dump(vocab_dict, vocab_file, indent=2, ensure_ascii=False)"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "save_dir = \"./output_models\"\n",
169
+ "# wrap tokenizer and feature extractor to processor\n",
170
+ "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab_demo.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
171
+ "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)\n",
172
+ "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)\n",
173
+ "processor.save_pretrained(save_dir)"
174
+ ]
175
+ },
176
+ {
177
+ "source": [
178
+ "# Prepare train and test dataset "
179
+ ],
180
+ "cell_type": "markdown",
181
+ "metadata": {}
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": null,
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": [
189
+ "# convert audio from 48kHz to 16kHz (standard sample rate of wav2vec2 model)\n",
190
+ "def speech_file_to_array_fn(batch):\n",
191
+ " speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n",
192
+ " batch[\"speech\"] = librosa.resample(np.asarray(speech_array[0].numpy()), 48_000, 16_000)\n",
193
+ " batch[\"sampling_rate\"] = 16_000\n",
194
+ " batch[\"target_text\"] = batch[\"sentence\"]\n",
195
+ " return batch\n",
196
+ "\n",
197
+ "common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names,num_proc=4)\n",
198
+ "common_voice_test = common_voice_test.map(speech_file_to_array_fn,remove_columns=common_voice_test.column_names, num_proc=4) "
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "# do augment to enrich common voice dataset \n",
208
+ "augment = Compose([\n",
209
+ " AddGaussianNoise(min_amplitude=0.0001, max_amplitude=0.001, p=0.8),\n",
210
+ " PitchShift(min_semitones=-1, max_semitones=1, p=0.8),\n",
211
+ " Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8),\n",
212
+ " TimeStretch(min_rate=0.8, max_rate=1.25, p=0.8)\n",
213
+ "\n",
214
+ "])\n",
215
+ "\n",
216
+ "def augmented_speech(batch, augment):\n",
217
+ " samples = np.array(batch[\"speech\"])\n",
218
+ " batch[\"speech\"] = augment(samples=samples, sample_rate=16000)\n",
219
+ " batch[\"sampling_rate\"] = 16_000\n",
220
+ " batch[\"target_text\"] = batch[\"target_text\"]\n",
221
+ " return batch\n",
222
+ "\n",
223
+ "# augment 50% of trainset\n",
224
+ "common_voice_train_augmented = common_voice_train.train_test_split(test_size = 0.5)['train']\n",
225
+ "common_voice_train_augmented = common_voice_train_augmented.map(lambda batch: augmented_speech(batch, augment), num_proc=4)\n",
226
+ "\n",
227
+ "# concatenate with trainset\n",
228
+ "common_voice_train = datasets.concatenate_datasets([common_voice_train_augmented, common_voice_train])"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "def prepare_dataset(batch):\n",
238
+ " # check that all files have the correct sampling rate\n",
239
+ " assert (\n",
240
+ " len(set(batch[\"sampling_rate\"])) == 1\n",
241
+ " ), f\"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}.\"\n",
242
+ "\n",
243
+ " batch[\"input_values\"] = processor(batch[\"speech\"], sampling_rate=batch[\"sampling_rate\"][0]).input_values\n",
244
+ " \n",
245
+ " with processor.as_target_processor():\n",
246
+ " batch[\"labels\"] = processor(batch[\"target_text\"]).input_ids\n",
247
+ " return batch\n",
248
+ " \n",
249
+ "# prepare dataset\n",
250
+ "common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=8, num_proc=4, batched=True)\n",
251
+ "common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)"
252
+ ]
253
+ },
254
+ {
255
+ "source": [
256
+ "# Training"
257
+ ],
258
+ "cell_type": "markdown",
259
+ "metadata": {}
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": null,
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "# create data collator\n",
268
+ "@dataclass\n",
269
+ "class DataCollatorCTCWithPadding:\n",
270
+ "\n",
271
+ " processor: Wav2Vec2Processor\n",
272
+ " padding: Union[bool, str] = True\n",
273
+ " max_length: Optional[int] = None\n",
274
+ " max_length_labels: Optional[int] = None\n",
275
+ " pad_to_multiple_of: Optional[int] = None\n",
276
+ " pad_to_multiple_of_labels: Optional[int] = None\n",
277
+ "\n",
278
+ " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
279
+ " input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n",
280
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
281
+ "\n",
282
+ " batch = self.processor.pad(\n",
283
+ " input_features,\n",
284
+ " padding=self.padding,\n",
285
+ " max_length=self.max_length,\n",
286
+ " pad_to_multiple_of=self.pad_to_multiple_of,\n",
287
+ " return_tensors=\"pt\",\n",
288
+ " )\n",
289
+ " with self.processor.as_target_processor():\n",
290
+ " labels_batch = self.processor.pad(\n",
291
+ " label_features,\n",
292
+ " padding=self.padding,\n",
293
+ " max_length=self.max_length_labels,\n",
294
+ " pad_to_multiple_of=self.pad_to_multiple_of_labels,\n",
295
+ " return_tensors=\"pt\",\n",
296
+ " )\n",
297
+ "\n",
298
+ " # replace padding with -100 to ignore loss correctly\n",
299
+ " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
300
+ "\n",
301
+ " batch[\"labels\"] = labels\n",
302
+ "\n",
303
+ " return batch\n",
304
+ "\n",
305
+ "data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": null,
311
+ "metadata": {},
312
+ "outputs": [],
313
+ "source": [
314
+ "# make metric function\n",
315
+ "wer_metric = load_metric(\"wer\")\n",
316
+ "\n",
317
+ "def compute_metrics(pred):\n",
318
+ " pred_logits = pred.predictions\n",
319
+ " pred_ids = np.argmax(pred_logits, axis=-1)\n",
320
+ "\n",
321
+ " pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n",
322
+ "\n",
323
+ " pred_str = processor.batch_decode(pred_ids)\n",
324
+ " # we do not want to group tokens when computing the metrics\n",
325
+ " label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n",
326
+ "\n",
327
+ " wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
328
+ "\n",
329
+ " return {\"wer\": wer}"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": null,
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "# create custom learning scheduler\n",
339
+ "\n",
340
+ "# polynomial decay\n",
341
+ "def get_polynomial_decay_schedule_with_warmup(\n",
342
+ " optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.2, last_epoch=-1\n",
343
+ "):\n",
344
+ "\n",
345
+ " lr_init = optimizer.defaults[\"lr\"]\n",
346
+ " assert lr_init > lr_end, f\"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})\"\n",
347
+ "\n",
348
+ " def lr_lambda(current_step: int):\n",
349
+ " if current_step < num_warmup_steps:\n",
350
+ " return float(current_step) / float(max(1, num_warmup_steps))\n",
351
+ " elif current_step > num_training_steps:\n",
352
+ " return lr_end / lr_init # as LambdaLR multiplies by lr_init\n",
353
+ " else:\n",
354
+ " lr_range = lr_init - lr_end\n",
355
+ " decay_steps = num_training_steps - num_warmup_steps\n",
356
+ " pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps\n",
357
+ " decay = lr_range * pct_remaining ** power + lr_end\n",
358
+ " return decay / lr_init # as LambdaLR multiplies by lr_init\n",
359
+ "\n",
360
+ " return LambdaLR(optimizer, lr_lambda, last_epoch)\n",
361
+ " \n",
362
+ "# wrap custom learning scheduler with trainer\n",
363
+ "class PolyTrainer(Trainer):\n",
364
+ " def __init__(self, *args, **kwargs):\n",
365
+ " super().__init__(*args, **kwargs)\n",
366
+ " \n",
367
+ " def create_scheduler(self, num_training_steps: int):\n",
368
+ " self.lr_scheduler = get_polynomial_decay_schedule_with_warmup(self.optimizer, \n",
369
+ " num_warmup_steps=self.args.warmup_steps,\n",
370
+ " num_training_steps=num_training_steps)\n",
371
+ " def create_optimizer_and_scheduler(self, num_training_steps: int):\n",
372
+ " self.create_optimizer()\n",
373
+ " self.create_scheduler(num_training_steps)"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": null,
379
+ "metadata": {},
380
+ "outputs": [],
381
+ "source": [
382
+ "# load pretrain model\n",
383
+ "model = Wav2Vec2ForCTC.from_pretrained(\n",
384
+ " \"facebook/wav2vec2-large-xlsr-53\", \n",
385
+ " attention_dropout=0.1,\n",
386
+ " hidden_dropout=0.1,\n",
387
+ " feat_proj_dropout=0.1,\n",
388
+ " mask_time_prob=0.1, \n",
389
+ " layerdrop=0.1,\n",
390
+ " gradient_checkpointing=True, \n",
391
+ " ctc_loss_reduction=\"mean\", \n",
392
+ " pad_token_id=processor.tokenizer.pad_token_id,\n",
393
+ " vocab_size=len(processor.tokenizer)\n",
394
+ ")\n",
395
+ "# freeze feature extractor\n",
396
+ "model.freeze_feature_extractor()\n",
397
+ "\n",
398
+ "# define train argument\n",
399
+ "training_args = TrainingArguments(\n",
400
+ " output_dir=save_dir,\n",
401
+ " group_by_length=True,\n",
402
+ " per_device_train_batch_size=32,\n",
403
+ " gradient_accumulation_steps=2,\n",
404
+ " evaluation_strategy=\"steps\",\n",
405
+ " num_train_epochs=200,\n",
406
+ " fp16=True,\n",
407
+ " save_steps=2400, \n",
408
+ " eval_steps=800,\n",
409
+ " logging_steps=800, \n",
410
+ " learning_rate=1e-4, \n",
411
+ " warmup_steps=1500, \n",
412
+ " save_total_limit=2,\n",
413
+ " load_best_model_at_end = True, \n",
414
+ " metric_for_best_model='wer', \n",
415
+ " greater_is_better=False\n",
416
+ ")\n",
417
+ "\n",
418
+ "# wrap everything to Trainer\n",
419
+ "trainer = PolyTrainer(\n",
420
+ " model=model,\n",
421
+ " data_collator=data_collator,\n",
422
+ " args=training_args,\n",
423
+ " compute_metrics=compute_metrics,\n",
424
+ " train_dataset=common_voice_train,\n",
425
+ " eval_dataset=common_voice_test,\n",
426
+ " tokenizer=processor.feature_extractor,\n",
427
+ ")"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": null,
433
+ "metadata": {},
434
+ "outputs": [],
435
+ "source": [
436
+ "# training\n",
437
+ "train_result = trainer.train()"
438
+ ]
439
+ },
440
+ {
441
+ "source": [
442
+ "# Testing result"
443
+ ],
444
+ "cell_type": "markdown",
445
+ "metadata": {}
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": null,
450
+ "metadata": {},
451
+ "outputs": [],
452
+ "source": [
453
+ "import torch\n",
454
+ "import torchaudio\n",
455
+ "from datasets import load_dataset, load_metric\n",
456
+ "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
457
+ "import MeCab\n",
458
+ "import pykakasi\n",
459
+ "import re\n",
460
+ "\n",
461
+ "#config\n",
462
+ "wakati = MeCab.Tagger(\"-Owakati\")\n",
463
+ "chars_to_ignore_regex = '[\\,\\、\\。\\.\\「\\」\\…\\?\\・]'\n",
464
+ "\n",
465
+ "#load model\n",
466
+ "processor = Wav2Vec2Processor.from_pretrained(save_dir)\n",
467
+ "test_model = Wav2Vec2ForCTC.from_pretrained(save_dir)\n",
468
+ "test_model.to(\"cuda\")\n",
469
+ "resampler = torchaudio.transforms.Resample(48_000, 16_000)\n",
470
+ "\n",
471
+ "#load testdata\n",
472
+ "test_dataset = load_dataset(\"common_voice\", \"ja\", split=\"test\")\n",
473
+ "wer = load_metric(\"wer\")\n",
474
+ "\n",
475
+ "# Preprocessing the datasets.\n",
476
+ "def speech_file_to_array_fn(batch):\n",
477
+ " batch[\"sentence\"] = wakati.parse(batch[\"sentence\"]).strip()\n",
478
+ " batch[\"sentence\"] = re.sub(chars_to_ignore_regex,'', batch[\"sentence\"]).strip()\n",
479
+ " speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n",
480
+ " batch[\"speech\"] = resampler(speech_array).squeeze().numpy()\n",
481
+ " return batch\n",
482
+ "\n",
483
+ "test_dataset = test_dataset.map(speech_file_to_array_fn)\n",
484
+ "\n",
485
+ "# Preprocessing the datasets.\n",
486
+ "# We need to read the audio files as arrays\n",
487
+ "def evaluate(batch):\n",
488
+ " inputs = processor(batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n",
489
+ "\n",
490
+ " with torch.no_grad():\n",
491
+ " logits = test_model(inputs.input_values.to(\"cuda\"), attention_mask=inputs.attention_mask.to(\"cuda\")).logits\n",
492
+ " pred_ids = torch.argmax(logits, dim=-1)\n",
493
+ " batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n",
494
+ " return batch\n",
495
+ "\n",
496
+ "result = test_dataset.map(evaluate, batched=True, batch_size=8)\n",
497
+ "\n",
498
+ "print(\"WER: {:2f}\".format(100 * wer.compute(predictions=result[\"pred_strings\"], references=result[\"sentence\"])))"
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "execution_count": null,
504
+ "metadata": {},
505
+ "outputs": [],
506
+ "source": [
507
+ "# print some results\n",
508
+ "pick = random.randint(0, len(common_voice_test_transcription)-1)\n",
509
+ "input_dict = processor(common_voice_test[\"input_values\"][pick], return_tensors=\"pt\", padding=True)\n",
510
+ "logits = test_model(input_dict.input_values.to(\"cuda\")).logits\n",
511
+ "pred_ids = torch.argmax(logits, dim=-1)[0]\n",
512
+ "\n",
513
+ "print(\"Prediction:\")\n",
514
+ "print(processor.decode(pred_ids).strip())\n",
515
+ "\n",
516
+ "print(\"\\nLabel:\")\n",
517
+ "print(processor.decode(common_voice_test['labels'][pick]))\n"
518
+ ]
519
+ }
520
+ ]
521
+ }
README.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: ja
3
+ datasets:
4
+ - common_voice
5
+ metrics:
6
+ - wer
7
+ tags:
8
+ - audio
9
+ - automatic-speech-recognition
10
+ - speech
11
+ - xlsr-fine-tuning-week
12
+ license: apache-2.0
13
+ model-index:
14
+ - name: XLSR Wav2Vec2 Japanese by Chien Vu
15
+ results:
16
+ - task:
17
+ name: Speech Recognition
18
+ type: automatic-speech-recognition
19
+ dataset:
20
+ name: Common Voice Japanese
21
+ type: common_voice
22
+ args: ja
23
+ metrics:
24
+ - name: Test WER
25
+ type: wer
26
+ value: 46.77
27
+ ---
28
+ # Wav2Vec2-Large-XLSR-53-Japanese
29
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Japanese using the [Common Voice](https://huggingface.co/datasets/common_voice) and Japanese speech corpus of Saruwatari-lab, University of Tokyo [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut).
30
+ When using this model, make sure that your speech input is sampled at 16kHz.
31
+ ## Usage
32
+ The model can be used directly (without a language model) as follows:
33
+ ```python
34
+ import torch
35
+ import torchaudio
36
+ import librosa
37
+ from datasets import load_dataset
38
+ import MeCab
39
+ import pykakasi
40
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
41
+
42
+ # config
43
+ wakati = MeCab.Tagger("-Owakati")
44
+ chars_to_ignore_regex = '[\,\、\。\.\「\」\…\?\・]'
45
+
46
+ # load data, processor and model
47
+ test_dataset = load_dataset("common_voice", "ja", split="test[:2%]")
48
+ processor = Wav2Vec2Processor.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
49
+ model = Wav2Vec2ForCTC.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
50
+ resampler = lambda sr, y: librosa.resample(y.numpy().squeeze(), sr, 16_000)
51
+
52
+ # Preprocessing the datasets.
53
+ def speech_file_to_array_fn(batch):
54
+ batch["sentence"] = wakati.parse(batch["sentence"]).strip()
55
+ batch["sentence"] = re.sub(chars_to_ignore_regex,'', batch["sentence"]).strip()
56
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
57
+ batch["speech"] = resampler(sampling_rate, speech_array).squeeze()
58
+ return batch
59
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
60
+ inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
61
+ with torch.no_grad():
62
+ logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
63
+ predicted_ids = torch.argmax(logits, dim=-1)
64
+ print("Prediction:", processor.batch_decode(predicted_ids))
65
+ print("Reference:", test_dataset["sentence"][:2])
66
+ ```
67
+ ## Evaluation
68
+ The model can be evaluated as follows on the Japanese test data of Common Voice.
69
+ ```python
70
+ import torch
71
+ import librosa
72
+ import torchaudio
73
+ from datasets import load_dataset
74
+ import MeCab
75
+ import pykakasi
76
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
77
+
78
+ #config
79
+ wakati = MeCab.Tagger("-Owakati")
80
+ chars_to_ignore_regex = '[\,\、\。\.\「\」\…\?\・]'
81
+
82
+ # load data, processor and model
83
+ test_dataset = load_dataset("common_voice", "ja", split="test")
84
+ wer = load_metric("wer")
85
+ processor = Wav2Vec2Processor.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
86
+ model = Wav2Vec2ForCTC.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
87
+ model.to("cuda")
88
+ resampler = lambda sr, y: librosa.resample(y.numpy().squeeze(), sr, 16_000)
89
+
90
+ # Preprocessing the datasets.
91
+ def speech_file_to_array_fn(batch):
92
+ batch["sentence"] = kakasi.do(wakati.parse(batch["sentence"]).strip())
93
+ batch["sentence"] = re.sub(chars_to_ignore_regex,'', batch["sentence"]).strip()
94
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
95
+ batch["speech"] = resampler(sampling_rate, speech_array).squeeze()
96
+ return batch
97
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
98
+
99
+ # evaluate function
100
+ def evaluate(batch):
101
+ inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
102
+ with torch.no_grad():
103
+ logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
104
+ pred_ids = torch.argmax(logits, dim=-1)
105
+ batch["pred_strings"] = processor.batch_decode(pred_ids)
106
+ return batch
107
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
108
+ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
109
+ ```
110
+ **Test Result**: 46.77
111
+ ## Training
112
+ The Common Voice `train`, `validation` datasets and Japanese speech corpus `basic5000` datasets were used for training.
113
+ The script used for training can be found [here](Fine-Tune-Wav2Vec2-Large-XLSR-Japan.ipynb)
config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
3
+ "activation_dropout": 0.0,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "Wav2Vec2ForCTC"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "conv_bias": true,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "mean",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": true,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_dropout": 0.0,
44
+ "feat_extract_norm": "layer",
45
+ "feat_proj_dropout": 0.1,
46
+ "final_dropout": 0.0,
47
+ "gradient_checkpointing": true,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 1024,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 4096,
53
+ "layer_norm_eps": 1e-05,
54
+ "layerdrop": 0.1,
55
+ "mask_channel_length": 10,
56
+ "mask_channel_min_space": 1,
57
+ "mask_channel_other": 0.0,
58
+ "mask_channel_prob": 0.0,
59
+ "mask_channel_selection": "static",
60
+ "mask_feature_length": 10,
61
+ "mask_feature_prob": 0.0,
62
+ "mask_time_length": 10,
63
+ "mask_time_min_space": 1,
64
+ "mask_time_other": 0.0,
65
+ "mask_time_prob": 0.1,
66
+ "mask_time_selection": "static",
67
+ "model_type": "wav2vec2",
68
+ "num_attention_heads": 16,
69
+ "num_conv_pos_embedding_groups": 16,
70
+ "num_conv_pos_embeddings": 128,
71
+ "num_feat_extract_layers": 7,
72
+ "num_hidden_layers": 24,
73
+ "pad_token_id": 2698,
74
+ "transformers_version": "4.5.0.dev0",
75
+ "vocab_size": 2699
76
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_size": 1,
4
+ "padding_side": "right",
5
+ "padding_value": 0.0,
6
+ "return_attention_mask": true,
7
+ "sampling_rate": 16000
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f20aa1dd1aea1e481a5e87034fe4733dabfb16d6c44096dfc55ecef8fd0777c
3
+ size 1272999703
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:540a9cb029ba4c351fdb2719b51eb95f3c1683ee51cb25588127285fcfe59420
3
+ size 2351
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"ソ": 0, "消": 1, "講": 2, "核": 3, "撓": 4, "圏": 5, "挽": 6, "鬱": 7, "瓢": 8, "証": 9, "煩": 10, "ヌ": 11, "童": 12, "少": 13, "疹": 14, "昭": 15, "躇": 16, "歴": 17, "淫": 18, "謡": 19, "葺": 20, "炒": 21, "神": 22, "軸": 23, "爬": 24, "談": 25, "趙": 26, "暗": 27, "鳴": 28, "弄": 29, "窯": 30, "御": 31, "全": 32, "雹": 33, "孺": 34, "班": 35, "抱": 36, "源": 37, "摯": 38, "雄": 39, "豚": 40, "焜": 41, "岩": 42, "法": 43, "泰": 44, "役": 45, "智": 46, "避": 47, "省": 48, "巾": 49, "家": 50, "癒": 51, "抵": 52, "想": 53, "環": 54, "襞": 55, "将": 56, "僅": 57, "宣": 58, "妨": 59, "醜": 60, "充": 61, "亡": 62, "犠": 63, "戯": 64, "首": 65, "や": 66, "剥": 67, "ど": 68, "茶": 69, "斗": 70, "艶": 71, "人": 72, "彗": 73, "填": 74, "絵": 75, "草": 76, "曖": 77, "謔": 78, "斧": 79, "サ": 80, "我": 81, "裏": 82, "働": 83, "濯": 84, "秋": 85, "殻": 86, "動": 87, "旧": 88, "像": 89, "謀": 90, "ワ": 91, "苗": 92, "塔": 93, "嘲": 94, "挨": 95, "猛": 96, "両": 97, "掃": 98, "蓄": 99, "ュ": 100, "啄": 101, "襦": 102, "憬": 103, "真": 104, "存": 105, "較": 106, "焚": 107, "偽": 108, "凌": 109, "蹙": 110, "尻": 111, "識": 112, "句": 113, "奥": 114, "崖": 115, "急": 116, "往": 117, "漑": 118, "詰": 119, "拶": 120, "勾": 121, "和": 122, "礁": 123, "謹": 124, "貴": 125, "膚": 126, "痼": 127, "装": 128, "梁": 129, "胃": 130, "髷": 131, "除": 132, "幽": 133, "薬": 134, "痒": 135, "列": 136, "庫": 137, "栃": 138, "頷": 139, "ぱ": 140, "僚": 141, "湯": 142, "死": 143, "亭": 144, "額": 145, "頒": 146, "婆": 147, "任": 148, "等": 149, "複": 150, "奮": 151, "訓": 152, "練": 153, "視": 154, "津": 155, "凶": 156, "方": 157, "性": 158, "格": 159, "覗": 160, "岳": 161, "妹": 162, "夏": 163, "祓": 164, "被": 165, "弱": 166, "闇": 167, "暑": 168, "隅": 169, "曾": 170, "畜": 171, "指": 172, "昔": 173, "橈": 174, "親": 175, "録": 176, "ー": 177, "刊": 178, "帰": 179, "律": 180, "遅": 181, "店": 182, "零": 183, "職": 184, "魅": 185, "巧": 186, "樽": 187, "立": 188, "裔": 189, "歩": 190, "析": 191, "俵": 192, "工": 193, "渠": 194, "支": 195, "勝": 196, "宝": 197, "ド": 198, "ゲ": 199, "聚": 200, "睡": 201, "壇": 202, "肥": 203, "天": 204, "廉": 205, "の": 206, "設": 207, "震": 208, "四": 209, "愚": 
210, "錆": 211, "爵": 212, "至": 213, "痢": 214, "炊": 215, "イ": 216, "卑": 217, "槌": 218, "交": 219, "介": 220, "禎": 221, "敗": 222, "旬": 223, "呂": 224, "郎": 225, "吐": 226, "蛭": 227, "俊": 228, "悲": 229, "囚": 230, "衆": 231, "唾": 232, "円": 233, "廓": 234, "ピ": 235, "恩": 236, "啓": 237, "選": 238, "精": 239, "掘": 240, "鶏": 241, "佳": 242, "陰": 243, "毒": 244, "俗": 245, "然": 246, "殖": 247, "呼": 248, "賈": 249, "ざ": 250, "寝": 251, "野": 252, "妥": 253, "詞": 254, "嗣": 255, "熏": 256, "替": 257, "孤": 258, "解": 259, "報": 260, "羨": 261, "す": 262, "縄": 263, "枷": 264, "陥": 265, "極": 266, "踏": 267, "縫": 268, "淋": 269, "忘": 270, "悩": 271, "知": 272, "ジ": 273, "思": 274, "猜": 275, "効": 276, "並": 277, "傷": 278, "が": 279, "唆": 280, "章": 281, "岬": 282, "測": 283, "堤": 284, "私": 285, "如": 286, "庭": 287, "寮": 288, "ザ": 289, "扇": 290, "質": 291, "ャ": 292, "線": 293, "喩": 294, "邑": 295, "区": 296, "登": 297, "賓": 298, "座": 299, "底": 300, "林": 301, "返": 302, "共": 303, "髴": 304, "奏": 305, "鎧": 306, "牢": 307, "硬": 308, "嬢": 309, "決": 310, "携": 311, "糖": 312, "樹": 313, "箇": 314, "漫": 315, "碑": 316, "楓": 317, "攻": 318, "絞": 319, "珍": 320, "呉": 321, "叛": 322, "上": 323, "割": 324, "菊": 325, "儒": 326, "近": 327, "履": 328, "駆": 329, "了": 330, "賦": 331, "檻": 332, "唇": 333, "男": 334, "ゃ": 335, "翠": 336, "墨": 337, "酎": 338, "溜": 339, "這": 340, "国": 341, "故": 342, "無": 343, "霜": 344, "枚": 345, "涼": 346, "詠": 347, "繁": 348, "苦": 349, "汎": 350, "薇": 351, "染": 352, "舟": 353, "閲": 354, "切": 355, "襟": 356, "茂": 357, "墜": 358, "略": 359, "売": 360, "袢": 361, "衿": 362, "衛": 363, "ダ": 364, "播": 365, "驚": 366, "酪": 367, "泣": 368, "密": 369, "畳": 370, "づ": 371, "礦": 372, "羊": 373, "嫁": 374, "彼": 375, "昧": 376, "回": 377, "邦": 378, "片": 379, "咽": 380, "豆": 381, "懐": 382, "あ": 383, "乏": 384, "油": 385, "振": 386, "塵": 387, "旋": 388, "需": 389, "読": 390, "合": 391, "俺": 392, "麦": 393, "女": 394, "れ": 395, "譜": 396, "包": 397, "ご": 398, "惹": 399, "キ": 400, "殉": 401, "帝": 402, "束": 403, "粛": 404, "脱": 405, "郡": 406, "寒": 407, "匿": 408, "糞": 409, "梅": 
410, "村": 411, "翁": 412, "び": 413, "橙": 414, "改": 415, "澤": 416, "灰": 417, "禰": 418, "卸": 419, "伝": 420, "晩": 421, "睨": 422, "息": 423, "千": 424, "老": 425, "通": 426, "馴": 427, "燐": 428, "裕": 429, "弓": 430, "案": 431, "誓": 432, "牛": 433, "腎": 434, "隣": 435, "爽": 436, "虐": 437, "矢": 438, "奪": 439, "収": 440, "ィ": 441, "脳": 442, "屋": 443, "織": 444, "要": 445, "車": 446, "侯": 447, "繰": 448, "番": 449, "應": 450, "体": 451, "醒": 452, "農": 453, "肖": 454, "喚": 455, "京": 456, "搬": 457, "積": 458, "術": 459, "詣": 460, "夫": 461, "タ": 462, "慨": 463, "伐": 464, "脈": 465, "馳": 466, "机": 467, "乗": 468, "斐": 469, "竿": 470, "限": 471, "史": 472, "葉": 473, "仁": 474, "起": 475, "灯": 476, "ァ": 477, "吟": 478, "買": 479, "演": 480, "葬": 481, "盾": 482, "潜": 483, "叉": 484, "語": 485, "霊": 486, "新": 487, "待": 488, "薫": 489, "慣": 490, "追": 491, "漂": 492, "憤": 493, "桐": 494, "梗": 495, "俳": 496, "越": 497, "埼": 498, "疸": 499, "ぁ": 500, "責": 501, "培": 502, "鉱": 503, "曲": 504, "菩": 505, "貨": 506, "施": 507, "古": 508, "距": 509, "抄": 510, "雨": 511, "徒": 512, "導": 513, "推": 514, "冷": 515, "結": 516, "泌": 517, "送": 518, "本": 519, "顧": 520, "皮": 521, "枳": 522, "耐": 523, "に": 524, "靖": 525, "血": 526, "太": 527, "汗": 528, "貫": 529, "只": 530, "関": 531, "暁": 532, "醸": 533, "因": 534, "拐": 535, "玲": 536, "勲": 537, "じ": 538, "雲": 539, "晒": 540, "脊": 541, "給": 542, "機": 543, "洲": 544, "哀": 545, "弊": 546, "綴": 547, "輪": 548, "罹": 549, "徴": 550, "項": 551, "へ": 552, "牧": 553, "細": 554, "璃": 555, "拝": 556, "函": 557, "桟": 558, "韻": 559, "同": 560, "納": 561, "河": 562, "軍": 563, "税": 564, "瀬": 565, "犬": 566, "薄": 567, "鉛": 568, "糸": 569, "邪": 570, "恨": 571, "墾": 572, "併": 573, "期": 574, "循": 575, "寺": 576, "掲": 577, "穀": 578, "春": 579, "朴": 580, "倒": 581, "授": 582, "軒": 583, "狗": 584, "盆": 585, "腹": 586, "匠": 587, "牙": 588, "妻": 589, "言": 590, "紐": 591, "隠": 592, "壮": 593, "緒": 594, "濡": 595, "技": 596, "ま": 597, "エ": 598, "誠": 599, "率": 600, "敢": 601, "ぴ": 602, "凧": 603, "眉": 604, "丼": 605, "経": 606, "習": 607, "統": 608, "畝": 609, "短": 
610, "狡": 611, "晴": 612, "穂": 613, "刻": 614, "姻": 615, "特": 616, "唐": 617, "る": 618, "俸": 619, "頃": 620, "数": 621, "愉": 622, "拡": 623, "残": 624, "類": 625, "光": 626, "鋼": 627, "傭": 628, "磁": 629, "尽": 630, "賞": 631, "吠": 632, "附": 633, "窓": 634, "浜": 635, "壌": 636, "岐": 637, "ぬ": 638, "宴": 639, "鋭": 640, "該": 641, "債": 642, "瞬": 643, "ね": 644, "胥": 645, "浴": 646, "戻": 647, "恭": 648, "熱": 649, "齢": 650, "築": 651, "簿": 652, "貸": 653, "ゥ": 654, "泊": 655, "採": 656, "韓": 657, "盃": 658, "右": 659, "門": 660, "跪": 661, "慶": 662, "肛": 663, "牽": 664, "刑": 665, "応": 666, "協": 667, "窟": 668, "雪": 669, "部": 670, "麗": 671, "険": 672, "電": 673, "但": 674, "逮": 675, "席": 676, "褐": 677, "矯": 678, "慮": 679, "市": 680, "論": 681, "を": 682, "騎": 683, "増": 684, "忌": 685, "投": 686, "朕": 687, "剖": 688, "幸": 689, "作": 690, "巳": 691, "資": 692, "賠": 693, "緻": 694, "路": 695, "召": 696, "疇": 697, "排": 698, "狂": 699, "二": 700, "手": 701, "郊": 702, "山": 703, "母": 704, "概": 705, "著": 706, "ロ": 707, "琥": 708, "慎": 709, "尊": 710, "癖": 711, "刈": 712, "強": 713, "孔": 714, "斬": 715, "雌": 716, "崇": 717, "兄": 718, "沖": 719, "荼": 720, "注": 721, "捕": 722, "肋": 723, "化": 724, "途": 725, "最": 726, "兼": 727, "参": 728, "域": 729, "入": 730, "抹": 731, "宰": 732, "臍": 733, "遣": 734, "悦": 735, "ぺ": 736, "品": 737, "世": 738, "讐": 739, "鉄": 740, "瑞": 741, "珂": 742, "興": 743, "凄": 744, "延": 745, "羽": 746, "諦": 747, "疾": 748, "尺": 749, "甚": 750, "討": 751, "竣": 752, "打": 753, "捨": 754, "枕": 755, "量": 756, "務": 757, "叶": 758, "厳": 759, "冗": 760, "煮": 761, "歳": 762, "請": 763, "校": 764, "其": 765, "拉": 766, "唱": 767, "ラ": 768, "靄": 769, "騰": 770, "研": 771, "負": 772, "張": 773, "疎": 774, "砂": 775, "喫": 776, "澄": 777, "滞": 778, "理": 779, "箋": 780, "義": 781, "威": 782, "咎": 783, "湖": 784, "十": 785, "謝": 786, "枢": 787, "紳": 788, "称": 789, "矛": 790, "紺": 791, "悸": 792, "為": 793, "舗": 794, "冶": 795, "益": 796, "藩": 797, "園": 798, "伯": 799, "貿": 800, "備": 801, "物": 802, "藍": 803, "艘": 804, "覧": 805, "載": 806, "恵": 807, "皿": 808, "構": 809, "剽": 
810, "膣": 811, "慰": 812, "中": 813, "随": 814, "劫": 815, "さ": 816, "監": 817, "那": 818, "茄": 819, "叔": 820, "腰": 821, "坪": 822, "小": 823, "妙": 824, "半": 825, "桃": 826, "弯": 827, "恐": 828, "診": 829, "壊": 830, "侶": 831, "璽": 832, "終": 833, "荷": 834, "拳": 835, "殿": 836, "祈": 837, "爆": 838, "色": 839, "ち": 840, "腔": 841, "沙": 842, "芝": 843, "貰": 844, "詐": 845, "綿": 846, "絹": 847, "庁": 848, "萄": 849, "敏": 850, "困": 851, "腫": 852, "稼": 853, "転": 854, "供": 855, "噛": 856, "楊": 857, "ム": 858, "示": 859, "瑚": 860, "���": 861, "照": 862, "獲": 863, "怠": 864, "糧": 865, "破": 866, "企": 867, "懇": 868, "汚": 869, "釈": 870, "渋": 871, "制": 872, "裾": 873, "耕": 874, "祥": 875, "封": 876, "丙": 877, "埋": 878, "蹂": 879, "痩": 880, "嘴": 881, "撲": 882, "戦": 883, "卓": 884, "訟": 885, "跨": 886, "感": 887, "漆": 888, "静": 889, "比": 890, "襲": 891, "籍": 892, "景": 893, "鼠": 894, "忍": 895, "漢": 896, "赤": 897, "嗅": 898, "陽": 899, "学": 900, "繭": 901, "砕": 902, "端": 903, "描": 904, "筆": 905, "銅": 906, "眺": 907, "償": 908, "叩": 909, "階": 910, "顕": 911, "垂": 912, "別": 913, "界": 914, "斯": 915, "喜": 916, "呈": 917, "瘍": 918, "塑": 919, "芯": 920, "戴": 921, "占": 922, "ツ": 923, "側": 924, "捗": 925, "覇": 926, "鱠": 927, "現": 928, "恣": 929, "帽": 930, "派": 931, "媒": 932, "億": 933, "貝": 934, "峡": 935, "遠": 936, "丸": 937, "明": 938, "渡": 939, "更": 940, "便": 941, "湾": 942, "狭": 943, "尼": 944, "オ": 945, "侮": 946, "父": 947, "肪": 948, "闕": 949, "院": 950, "髪": 951, "元": 952, "錮": 953, "前": 954, "良": 955, "筑": 956, "コ": 957, "寓": 958, "礼": 959, "鞘": 960, "臭": 961, "柔": 962, "倹": 963, "腕": 964, "艇": 965, "嘱": 966, "だ": 967, "銀": 968, "串": 969, "プ": 970, "賛": 971, "涙": 972, "佐": 973, "穴": 974, "禁": 975, "酒": 976, "稚": 977, "靜": 978, "倣": 979, "妊": 980, "袋": 981, "宮": 982, "ゼ": 983, "購": 984, "大": 985, "パ": 986, "栄": 987, "瓶": 988, "辰": 989, "賑": 990, "堕": 991, "ハ": 992, "折": 993, "則": 994, "駅": 995, "速": 996, "形": 997, "弘": 998, "宿": 999, "え": 1000, "申": 1001, "点": 1002, "慢": 1003, "飽": 1004, "液": 1005, "操": 1006, "献": 1007, "裸": 1008, 
"痛": 1009, "鳥": 1010, "綻": 1011, "錠": 1012, "純": 1013, "寡": 1014, "唄": 1015, "隆": 1016, "峰": 1017, "欲": 1018, "怨": 1019, "長": 1020, "鎖": 1021, "怪": 1022, "た": 1023, "嫌": 1024, "昆": 1025, "緯": 1026, "撮": 1027, "費": 1028, "香": 1029, "意": 1030, "永": 1031, "坂": 1032, "看": 1033, "台": 1034, "綺": 1035, "廃": 1036, "な": 1037, "杏": 1038, "覚": 1039, "眩": 1040, "東": 1041, "願": 1042, "濃": 1043, "芳": 1044, "造": 1045, "九": 1046, "橋": 1047, "伺": 1048, "楷": 1049, "誘": 1050, "頓": 1051, "娠": 1052, "子": 1053, "祝": 1054, "ヘ": 1055, "絡": 1056, "縦": 1057, "版": 1058, "霧": 1059, "渇": 1060, "型": 1061, "曇": 1062, "ほ": 1063, "級": 1064, "潔": 1065, "固": 1066, "肩": 1067, "椅": 1068, "深": 1069, "ユ": 1070, "把": 1071, "生": 1072, "恥": 1073, "塩": 1074, "賀": 1075, "否": 1076, "っ": 1077, "メ": 1078, "儲": 1079, "頼": 1080, "算": 1081, "う": 1082, "会": 1083, "乞": 1084, "徳": 1085, "冒": 1086, "盟": 1087, "猶": 1088, "駄": 1089, "気": 1090, "ぼ": 1091, "妓": 1092, "味": 1093, "暦": 1094, "鍔": 1095, "悪": 1096, "棺": 1097, "掻": 1098, "紡": 1099, "箸": 1100, "行": 1101, "偵": 1102, "仰": 1103, "容": 1104, "暫": 1105, "懸": 1106, "猿": 1107, "係": 1108, "紅": 1109, "伏": 1110, "粉": 1111, "蛮": 1112, "煙": 1113, "某": 1114, "膳": 1115, "劇": 1116, "族": 1117, "墟": 1118, "心": 1119, "苑": 1120, "災": 1121, "猫": 1122, "糊": 1123, "姜": 1124, "凸": 1125, "遜": 1126, "ズ": 1127, "歓": 1128, "城": 1129, "擁": 1130, "酬": 1131, "累": 1132, "境": 1133, "遺": 1134, "射": 1135, "月": 1136, "稽": 1137, "尖": 1138, "宙": 1139, "臣": 1140, "暇": 1141, "停": 1142, "繋": 1143, "苛": 1144, "刹": 1145, "犯": 1146, "旗": 1147, "典": 1148, "狩": 1149, "踪": 1150, "茜": 1151, "祷": 1152, "漸": 1153, "菱": 1154, "岡": 1155, "姿": 1156, "疑": 1157, "勤": 1158, "薪": 1159, "寂": 1160, "政": 1161, "英": 1162, "闘": 1163, "賂": 1164, "鉤": 1165, "態": 1166, "腺": 1167, "鰭": 1168, "乙": 1169, "僕": 1170, "謎": 1171, "捜": 1172, "含": 1173, "顰": 1174, "篤": 1175, "鑑": 1176, "頻": 1177, "与": 1178, "変": 1179, "凍": 1180, "躊": 1181, "圭": 1182, "ブ": 1183, "躍": 1184, "ぶ": 1185, "浦": 1186, "編": 1187, "瞭": 1188, "爛": 1189, "催": 
1190, "砲": 1191, "径": 1192, "曰": 1193, "袖": 1194, "蔽": 1195, "潰": 1196, "音": 1197, "臨": 1198, "ネ": 1199, "彩": 1200, "撤": 1201, "妄": 1202, "競": 1203, "陸": 1204, "独": 1205, "ボ": 1206, "脂": 1207, "陳": 1208, "護": 1209, "段": 1210, "虞": 1211, "搭": 1212, "志": 1213, "槍": 1214, "具": 1215, "逆": 1216, "轄": 1217, "葡": 1218, "も": 1219, "傘": 1220, "契": 1221, "傲": 1222, "説": 1223, "喉": 1224, "依": 1225, "凡": 1226, "副": 1227, "鍵": 1228, "陛": 1229, "峻": 1230, "蒙": 1231, "健": 1232, "去": 1233, "辛": 1234, "彙": 1235, "身": 1236, "髄": 1237, "舌": 1238, "位": 1239, "濁": 1240, "索": 1241, "辿": 1242, "件": 1243, "浸": 1244, "紀": 1245, "早": 1246, "聡": 1247, "汝": 1248, "羅": 1249, "ノ": 1250, "で": 1251, "賭": 1252, "勅": 1253, "塚": 1254, "球": 1255, "横": 1256, "ス": 1257, "邸": 1258, "セ": 1259, "模": 1260, "是": 1261, "署": 1262, "甲": 1263, "ウ": 1264, "遮": 1265, "才": 1266, "錬": 1267, "泡": 1268, "足": 1269, "督": 1270, "く": 1271, "聞": 1272, "陪": 1273, "医": 1274, "卒": 1275, "創": 1276, "先": 1277, "扈": 1278, "州": 1279, "披": 1280, "鼓": 1281, "在": 1282, "寛": 1283, "鱗": 1284, "麒": 1285, "麺": 1286, "炭": 1287, "玄": 1288, "幅": 1289, "救": 1290, "差": 1291, "肝": 1292, "弁": 1293, "鳶": 1294, "吊": 1295, "餐": 1296, "活": 1297, "師": 1298, "風": 1299, "閥": 1300, "溝": 1301, "互": 1302, "成": 1303, "嗜": 1304, "着": 1305, "洗": 1306, "双": 1307, "詮": 1308, "贅": 1309, "滴": 1310, "堅": 1311, "刷": 1312, "航": 1313, "屍": 1314, "翼": 1315, "滋": 1316, "室": 1317, "助": 1318, "朋": 1319, "廷": 1320, "探": 1321, "借": 1322, "峠": 1323, "蜜": 1324, "漏": 1325, "正": 1326, "放": 1327, "巨": 1328, "薔": 1329, "領": 1330, "潮": 1331, "到": 1332, "様": 1333, "患": 1334, "信": 1335, "罅": 1336, "疫": 1337, "用": 1338, "浪": 1339, "畔": 1340, "蓋": 1341, "坦": 1342, "嶋": 1343, "伎": 1344, "蛛": 1345, "建": 1346, "勘": 1347, "腱": 1348, "雰": 1349, "租": 1350, "考": 1351, "褒": 1352, "代": 1353, "杖": 1354, "腸": 1355, "嘩": 1356, "黄": 1357, "緩": 1358, "宇": 1359, "適": 1360, "砦": 1361, "哺": 1362, "宜": 1363, "迎": 1364, "鍛": 1365, "婦": 1366, "べ": 1367, "惜": 1368, "乾": 1369, "憐": 1370, "且": 1371, 
"来": 1372, "氏": 1373, "よ": 1374, "援": 1375, "と": 1376, "阻": 1377, "墳": 1378, "廊": 1379, "緊": 1380, "淡": 1381, "雇": 1382, "欄": 1383, "週": 1384, "鶴": 1385, "龍": 1386, "広": 1387, "呆": 1388, "桁": 1389, "文": 1390, "炸": 1391, "誕": 1392, "祭": 1393, "趣": 1394, "飯": 1395, "堵": 1396, "裂": 1397, "鎌": 1398, "受": 1399, "飲": 1400, "鳩": 1401, "帆": 1402, "未": 1403, "ケ": 1404, "箪": 1405, "お": 1406, "岸": 1407, "嘘": 1408, "傾": 1409, "遷": 1410, "薦": 1411, "百": 1412, "焼": 1413, "伍": 1414, "氷": 1415, "堺": 1416, "叫": 1417, "棄": 1418, "障": 1419, "武": 1420, "ぽ": 1421, "安": 1422, "厄": 1423, "員": 1424, "垣": 1425, "騒": 1426, "丘": 1427, "郵": 1428, "誇": 1429, "ポ": 1430, "聊": 1431, "狙": 1432, "葛": 1433, "拷": 1434, "維": 1435, "旅": 1436, "楕": 1437, "釜": 1438, "酸": 1439, "痴": 1440, "間": 1441, "フ": 1442, "群": 1443, "波": 1444, "滲": 1445, "縮": 1446, "摩": 1447, "顔": 1448, "カ": 1449, "鬘": 1450, "藻": 1451, "仲": 1452, "促": 1453, "ぐ": 1454, "癇": 1455, "姉": 1456, "堪": 1457, "飛": 1458, "ゅ": 1459, "凝": 1460, "祉": 1461, "訃": 1462, "久": 1463, "巻": 1464, "礎": 1465, "官": 1466, "牲": 1467, "艦": 1468, "鈴": 1469, "客": 1470, "究": 1471, "績": 1472, "謁": 1473, "勇": 1474, "盗": 1475, "届": 1476, "熊": 1477, "致": 1478, "ペ": 1479, "商": 1480, "又": 1481, "朽": 1482, "阪": 1483, "喰": 1484, "琴": 1485, "旨": 1486, "値": 1487, "晶": 1488, "臓": 1489, "夕": 1490, "組": 1491, "他": 1492, "住": 1493, "暴": 1494, "融": 1495, "相": 1496, "害": 1497, "餌": 1498, "栓": 1499, "針": 1500, "ん": 1501, "遵": 1502, "森": 1503, "渦": 1504, "慈": 1505, "ェ": 1506, "払": 1507, "鵬": 1508, "閉": 1509, "乱": 1510, "離": 1511, "満": 1512, "勢": 1513, "拙": 1514, "幾": 1515, "戊": 1516, "崩": 1517, "弧": 1518, "板": 1519, "媛": 1520, "胡": 1521, "ヨ": 1522, "高": 1523, "ず": 1524, "憚": 1525, "床": 1526, "享": 1527, "躯": 1528, "涯": 1529, "丈": 1530, "閣": 1531, "庸": 1532, "荒": 1533, "畏": 1534, "噌": 1535, "博": 1536, "薯": 1537, "堂": 1538, "槽": 1539, "曽": 1540, "鯨": 1541, "劣": 1542, "鎮": 1543, "浄": 1544, "紙": 1545, "汽": 1546, "不": 1547, "調": 1548, "道": 1549, "販": 1550, "吏": 1551, "還": 1552, "准": 
1553, "符": 1554, "鰻": 1555, "偉": 1556, "拭": 1557, "笠": 1558, "何": 1559, "ン": 1560, "賢": 1561, "善": 1562, "快": 1563, "祠": 1564, "ナ": 1565, "き": 1566, "書": 1567, "諺": 1568, "盛": 1569, "塁": 1570, "達": 1571, "尾": 1572, "薩": 1573, "問": 1574, "情": 1575, "罠": 1576, "誌": 1577, "般": 1578, "内": 1579, "て": 1580, "鷹": 1581, "毎": 1582, "棒": 1583, "栗": 1584, "及": 1585, "幌": 1586, "誤": 1587, "公": 1588, "橘": 1589, "餓": 1590, "漿": 1591, "欣": 1592, "愛": 1593, "目": 1594, "巣": 1595, "象": 1596, "奇": 1597, "ゆ": 1598, "取": 1599, "述": 1600, "郭": 1601, "厭": 1602, "療": 1603, "粗": 1604, "米": 1605, "罰": 1606, "謙": 1607, "詩": 1608, "グ": 1609, "就": 1610, "透": 1611, "ョ": 1612, "浅": 1613, "各": 1614, "罵": 1615, "検": 1616, "可": 1617, "ぷ": 1618, "つ": 1619, "奈": 1620, "福": 1621, "忠": 1622, "挟": 1623, "ォ": 1624, "リ": 1625, "征": 1626, "彫": 1627, "せ": 1628, "党": 1629, "刺": 1630, "拒": 1631, "巡": 1632, "頑": 1633, "己": 1634, "北": 1635, "外": 1636, "扁": 1637, "船": 1638, "掌": 1639, "胎": 1640, "燗": 1641, "株": 1642, "婿": 1643, "仙": 1644, "表": 1645, "華": 1646, "遂": 1647, "麟": 1648, "挫": 1649, "瑠": 1650, "喪": 1651, "夷": 1652, "絶": 1653, "直": 1654, "難": 1655, "蔑": 1656, "逓": 1657, "堆": 1658, "斑": 1659, "孫": 1660, "観": 1661, "海": 1662, "鐘": 1663, "乃": 1664, "再": 1665, "平": 1666, "戸": 1667, "嵐": 1668, "系": 1669, "渓": 1670, "評": 1671, "絆": 1672, "殺": 1673, "鞄": 1674, "布": 1675, "虚": 1676, "諧": 1677, "絨": 1678, "拘": 1679, "斉": 1680, "剣": 1681, "力": 1682, "淵": 1683, "尉": 1684, "奴": 1685, "労": 1686, "こ": 1687, "紫": 1688, "戚": 1689, "午": 1690, "温": 1691, "篭": 1692, "筒": 1693, "閑": 1694, "衡": 1695, "饅": 1696, "口": 1697, "ト": 1698, "酢": 1699, "劾": 1700, "植": 1701, "準": 1702, "轟": 1703, "衷": 1704, "硫": 1705, "諮": 1706, "慕": 1707, "湧": 1708, "嘔": 1709, "県": 1710, "斤": 1711, "程": 1712, "街": 1713, "睦": 1714, "計": 1715, "利": 1716, "徹": 1717, "地": 1718, "拠": 1719, "仕": 1720, "試": 1721, "蓮": 1722, "擬": 1723, "呪": 1724, "治": 1725, "魚": 1726, "輩": 1727, "失": 1728, "虹": 1729, "星": 1730, "翻": 1731, "げ": 1732, "笹": 1733, "ア": 1734, 
"は": 1735, "喧": 1736, "譲": 1737, "刃": 1738, "訂": 1739, "ょ": 1740, "朗": 1741, "流": 1742, "猟": 1743, "み": 1744, "棋": 1745, "努": 1746, "兆": 1747, "雷": 1748, "匂": 1749, "胴": 1750, "映": 1751, "抒": 1752, "三": 1753, "普": 1754, "疵": 1755, "措": 1756, "芽": 1757, "異": 1758, "査": 1759, "颯": 1760, "憎": 1761, "皆": 1762, "于": 1763, "配": 1764, "順": 1765, "惨": 1766, "棟": 1767, "缶": 1768, "鼻": 1769, "摘": 1770, "預": 1771, "扱": 1772, "桜": 1773, "夢": 1774, "堀": 1775, "笑": 1776, "候": 1777, "担": 1778, "屠": 1779, "憲": 1780, "怒": 1781, "清": 1782, "鞭": 1783, "漁": 1784, "原": 1785, "宅": 1786, "状": 1787, "守": 1788, "拍": 1789, "察": 1790, "わ": 1791, "悟": 1792, "蔭": 1793, "赦": 1794, "持": 1795, "赴": 1796, "燥": 1797, "瞳": 1798, "ク": 1799, "沃": 1800, "分": 1801, "汰": 1802, "者": 1803, "火": 1804, "逃": 1805, "置": 1806, "暖": 1807, "溶": 1808, "欧": 1809, "紹": 1810, "背": 1811, "陀": 1812, "秘": 1813, "下": 1814, "票": 1815, "悔": 1816, "事": 1817, "尋": 1818, "飢": 1819, "昨": 1820, "民": 1821, "干": 1822, "侵": 1823, "有": 1824, "痘": 1825, "引": 1826, "脹": 1827, "沿": 1828, "承": 1829, "枯": 1830, "斜": 1831, "警": 1832, "兵": 1833, "出": 1834, "濫": 1835, "欠": 1836, "認": 1837, "押": 1838, "専": 1839, "留": 1840, "屯": 1841, "節": 1842, "飾": 1843, "蝶": 1844, "銘": 1845, "獄": 1846, "宛": 1847, "扉": 1848, "壱": 1849, "騙": 1850, "済": 1851, "熟": 1852, "懲": 1853, "緑": 1854, "課": 1855, "酵": 1856, "躙": 1857, "墓": 1858, "玩": 1859, "蛇": 1860, "奉": 1861, "噴": 1862, "庶": 1863, "遇": 1864, "洋": 1865, "日": 1866, "架": 1867, "哨": 1868, "序": 1869, "汁": 1870, "倉": 1871, "勉": 1872, "ぉ": 1873, "ヒ": 1874, "暮": 1875, "藤": 1876, "拾": 1877, "控": 1878, "鮫": 1879, "策": 1880, "械": 1881, "穏": 1882, "蝦": 1883, "微": 1884, "瓜": 1885, "聖": 1886, "洞": 1887, "胆": 1888, "暢": 1889, "菌": 1890, "牌": 1891, "左": 1892, "司": 1893, "免": 1894, "讃": 1895, "貯": 1896, "権": 1897, "跳": 1898, "握": 1899, "苔": 1900, "宦": 1901, "違": 1902, "滅": 1903, "多": 1904, "哲": 1905, "姪": 1906, "蚕": 1907, "ゾ": 1908, "貞": 1909, "竜": 1910, "塾": 1911, "名": 1912, "告": 1913, "総": 1914, "曜": 1915, "窃": 
1916, "詔": 1917, "ろ": 1918, "仮": 1919, "飄": 1920, "顎": 1921, "谷": 1922, "里": 1923, "饒": 1924, "壕": 1925, "従": 1926, "け": 1927, "実": 1928, "痕": 1929, "諸": 1930, "陵": 1931, "沢": 1932, "羞": 1933, "斎": 1934, "由": 1935, "鴉": 1936, "杯": 1937, "兜": 1938, "愁": 1939, "館": 1940, "憑": 1941, "層": 1942, "雑": 1943, "業": 1944, "貼": 1945, "重": 1946, "営": 1947, "迭": 1948, "荘": 1949, "主": 1950, "喝": 1951, "奨": 1952, "ホ": 1953, "帯": 1954, "令": 1955, "冥": 1956, "繍": 1957, "水": 1958, "話": 1959, "ら": 1960, "樫": 1961, "金": 1962, "傍": 1963, "枠": 1964, "棚": 1965, "必": 1966, "硝": 1967, "虫": 1968, "虜": 1969, "珊": 1970, "命": 1971, "止": 1972, "ビ": 1973, "加": 1974, "錨": 1975, "礬": 1976, "鱒": 1977, "昇": 1978, "肺": 1979, "辱": 1980, "募": 1981, "祐": 1982, "南": 1983, "求": 1984, "旺": 1985, "一": 1986, "角": 1987, "り": 1988, "掛": 1989, "寅": 1990, "均": 1991, "攀": 1992, "芸": 1993, "紋": 1994, "厠": 1995, "六": 1996, "衝": 1997, "几": 1998, "当": 1999, "寸": 2000, "超": 2001, "炉": 2002, "断": 2003, "燃": 2004, "夜": 2005, "似": 2006, "毬": 2007, "頭": 2008, "念": 2009, "皇": 2010, "材": 2011, "惑": 2012, "そ": 2013, "房": 2014, "シ": 2015, "朝": 2016, "叱": 2017, "舞": 2018, "締": 2019, "約": 2020, "黙": 2021, "漠": 2022, "董": 2023, "希": 2024, "酌": 2025, "禿": 2026, "沸": 2027, "雅": 2028, "ヤ": 2029, "鋳": 2030, "製": 2031, "軟": 2032, "進": 2033, "茅": 2034, "窩": 2035, "挙": 2036, "輝": 2037, "舎": 2038, "発": 2039, "肴": 2040, "臼": 2041, "叙": 2042, "婚": 2043, "洩": 2044, "鷲": 2045, "康": 2046, "唸": 2047, "基": 2048, "眈": 2049, "枡": 2050, "掴": 2051, "潟": 2052, "保": 2053, "蜂": 2054, "鬼": 2055, "瓦": 2056, "万": 2057, "諏": 2058, "腐": 2059, "遊": 2060, "糾": 2061, "拓": 2062, "初": 2063, "唯": 2064, "迅": 2065, "膝": 2066, "聳": 2067, "か": 2068, "寄": 2069, "果": 2070, "舛": 2071, "摂": 2072, "冠": 2073, "翌": 2074, "素": 2075, "帥": 2076, "倍": 2077, "狼": 2078, "稿": 2079, "柳": 2080, "休": 2081, "補": 2082, "銭": 2083, "歌": 2084, "爪": 2085, "陶": 2086, "凹": 2087, "衰": 2088, "賤": 2089, "袍": 2090, "港": 2091, "移": 2092, "ひ": 2093, "粋": 2094, "防": 2095, "禅": 2096, "レ": 2097, 
"予": 2098, "阿": 2099, "抜": 2100, "寿": 2101, "罪": 2102, "捻": 2103, "撚": 2104, "垢": 2105, "坊": 2106, "磨": 2107, "卜": 2108, "頬": 2109, "塀": 2110, "繊": 2111, "珀": 2112, "見": 2113, "迫": 2114, "乳": 2115, "択": 2116, "争": 2117, "渉": 2118, "哉": 2119, "撒": 2120, "毛": 2121, "坑": 2122, "器": 2123, "茎": 2124, "使": 2125, "揃": 2126, "混": 2127, "憩": 2128, "焦": 2129, "影": 2130, "社": 2131, "虎": 2132, "徐": 2133, "駐": 2134, "沈": 2135, "い": 2136, "豪": 2137, "鉢": 2138, "銃": 2139, "隷": 2140, "範": 2142, "賽": 2143, "連": 2144, "灼": 2145, "軌": 2146, "崎": 2147, "幹": 2148, "儀": 2149, "蝉": 2150, "朱": 2151, "次": 2152, "託": 2153, "ガ": 2154, "露": 2155, "第": 2156, "欺": 2157, "綱": 2158, "降": 2159, "瞞": 2160, "央": 2161, "竈": 2162, "所": 2163, "科": 2164, "秩": 2165, "妬": 2166, "遍": 2167, "辣": 2168, "娼": 2169, "験": 2170, "響": 2171, "攫": 2172, "頤": 2173, "繕": 2174, "育": 2175, "籠": 2176, "疲": 2177, "頚": 2178, "貢": 2179, "僧": 2180, "贈": 2181, "楽": 2182, "殴": 2183, "写": 2184, "空": 2185, "嘆": 2186, "錐": 2187, "娯": 2188, "抑": 2189, "若": 2190, "例": 2191, "款": 2192, "規": 2193, "蔵": 2194, "季": 2195, "局": 2196, "敵": 2197, "丞": 2198, "面": 2199, "美": 2200, "迷": 2201, "居": 2202, "展": 2203, "揺": 2204, "帳": 2205, "癌": 2206, "鉦": 2207, "君": 2208, "姓": 2209, "答": 2210, "錘": 2211, "完": 2212, "窒": 2213, "慌": 2214, "珠": 2215, "逸": 2216, "批": 2217, "膜": 2218, "江": 2219, "提": 2220, "眠": 2221, "鏡": 2222, "教": 2223, "簡": 2224, "単": 2225, "憂": 2226, "即": 2227, "駒": 2228, "屁": 2229, "鈍": 2230, "ぎ": 2231, "画": 2232, "枝": 2233, "獅": 2234, "弐": 2235, "望": 2236, "搾": 2237, "損": 2238, "木": 2239, "沼": 2240, "粧": 2241, "酔": 2242, "挑": 2243, "卵": 2244, "懺": 2245, "審": 2246, "詳": 2247, "判": 2248, "滑": 2249, "蛍": 2250, "丁": 2251, "友": 2252, "町": 2253, "刀": 2254, "歯": 2255, "餃": 2256, "鯉": 2257, "復": 2258, "以": 2259, "散": 2260, "撃": 2261, "縁": 2262, "誰": 2263, "マ": 2264, "号": 2265, "灘": 2266, "個": 2267, "饉": 2268, "殆": 2269, "土": 2270, "賃": 2271, "禍": 2272, "偶": 2273, "扶": 2274, "窮": 2275, "抽": 2276, "孝": 2277, "花": 2278, "島": 2279, "跡": 
2280, "祀": 2281, "肌": 2282, "賊": 2283, "丹": 2284, "式": 2285, "捧": 2286, "逝": 2287, "克": 2288, "采": 2289, "訪": 2290, "餅": 2291, "既": 2292, "服": 2293, "罷": 2294, "価": 2295, "ギ": 2296, "時": 2297, "ヴ": 2298, "淑": 2299, "開": 2300, "ッ": 2301, "寥": 2302, "柿": 2303, "漬": 2304, "弦": 2305, "図": 2306, "毀": 2307, "悠": 2308, "敬": 2309, "紛": 2310, "豊": 2311, "ば": 2312, "修": 2313, "伴": 2314, "磯": 2315, "定": 2316, "続": 2317, "凛": 2318, "隙": 2319, "鹿": 2320, "杳": 2321, "嚇": 2322, "声": 2323, "諭": 2324, "頸": 2325, "氣": 2326, "柵": 2327, "厚": 2328, "魔": 2329, "幼": 2330, "琉": 2331, "践": 2332, "煎": 2333, "肘": 2334, "確": 2335, "処": 2336, "穫": 2337, "剪": 2338, "囲": 2339, "骨": 2340, "柱": 2341, "走": 2342, "冬": 2343, "侍": 2344, "粒": 2345, "減": 2346, "錦": 2347, "股": 2348, "モ": 2349, "秀": 2350, "鞍": 2351, "却": 2352, "郷": 2353, "椒": 2354, "弾": 2355, "阜": 2356, "廂": 2357, "剰": 2358, "退": 2359, "革": 2360, "圧": 2361, "憺": 2362, "膨": 2363, "遥": 2364, "し": 2365, "換": 2366, "臆": 2367, "恒": 2368, "富": 2369, "亀": 2370, "捉": 2371, "惧": 2372, "反": 2373, "栽": 2374, "ミ": 2375, "煉": 2376, "噂": 2377, "漱": 2378, "貌": 2379, "慄": 2380, "竹": 2381, "滝": 2382, "溢": 2383, "恢": 2384, "廻": 2385, "忙": 2386, "雛": 2387, "菓": 2388, "璧": 2389, "茨": 2390, "抗": 2391, "執": 2392, "蝕": 2393, "題": 2394, "札": 2395, "川": 2396, "的": 2397, "訳": 2398, "付": 2399, "稲": 2400, "脇": 2401, "埃": 2402, "笛": 2403, "挿": 2404, "擦": 2405, "衣": 2406, "デ": 2407, "盲": 2408, "妖": 2409, "網": 2410, "幻": 2411, "塞": 2412, "楼": 2413, "綾": 2414, "娘": 2415, "踊": 2416, "殊": 2417, "怖": 2418, "煽": 2419, "弔": 2420, "削": 2421, "秒": 2422, "耗": 2423, "雀": 2424, "際": 2425, "弟": 2426, "蘇": 2427, "石": 2428, "甘": 2429, "今": 2430, "辞": 2431, "掟": 2432, "舶": 2433, "篩": 2434, "冊": 2435, "釣": 2436, "柄": 2437, "嚥": 2438, "非": 2439, "妃": 2440, "傑": 2441, "須": 2442, "韮": 2443, "得": 2444, "宵": 2445, "能": 2446, "西": 2447, "��": 2448, "余": 2449, "恋": 2450, "豹": 2451, "斥": 2452, "揚": 2453, "菜": 2454, "胞": 2455, "毯": 2456, "遭": 2457, "晰": 2458, "扮": 2459, "病": 2460, "寧": 2461, 
"諾": 2462, "泳": 2463, "自": 2464, "誉": 2465, "洒": 2466, "錯": 2467, "歪": 2468, "迦": 2469, "賄": 2470, "嚢": 2471, "裁": 2472, "羹": 2473, "昏": 2474, "后": 2475, "字": 2476, "向": 2477, "亜": 2478, "灌": 2479, "ニ": 2480, "種": 2481, "々": 2482, "団": 2483, "低": 2484, "杉": 2485, "湿": 2486, "脚": 2487, "弥": 2488, "宗": 2489, "托": 2490, "吸": 2491, "根": 2492, "屈": 2493, "升": 2494, "軽": 2495, "獣": 2496, "彰": 2497, "危": 2498, "悼": 2499, "励": 2500, "檎": 2501, "劉": 2502, "込": 2503, "訴": 2504, "箔": 2505, "脅": 2506, "蒸": 2507, "嵌": 2508, "ふ": 2509, "芋": 2510, "貧": 2511, "溺": 2512, "脆": 2513, "奔": 2514, "倫": 2515, "纏": 2516, "田": 2517, "之": 2518, "炎": 2519, "五": 2520, "鵜": 2521, "髣": 2522, "曹": 2523, "突": 2524, "賜": 2525, "姦": 2526, "委": 2527, "常": 2528, "眼": 2529, "末": 2530, "継": 2531, "過": 2532, "鉾": 2533, "戒": 2534, "嫡": 2535, "場": 2536, "姫": 2537, "鮮": 2538, "整": 2539, "耳": 2540, "王": 2541, "潤": 2542, "胸": 2543, "喋": 2544, "蚊": 2545, "簀": 2546, "飼": 2547, "憧": 2548, "料": 2549, "尚": 2550, "肉": 2551, "易": 2552, "ぜ": 2553, "標": 2554, "隻": 2555, "肯": 2556, "蜘": 2557, "ゴ": 2558, "む": 2559, "勃": 2560, "洪": 2561, "宋": 2562, "め": 2563, "塊": 2564, "匹": 2565, "後": 2566, "貪": 2567, "隊": 2568, "咲": 2569, "池": 2570, "府": 2571, "チ": 2572, "招": 2573, "麓": 2574, "許": 2575, "渥": 2576, "嬉": 2577, "閃": 2578, "辺": 2579, "添": 2580, "症": 2581, "壁": 2582, "幕": 2583, "偏": 2584, "魂": 2585, "憶": 2586, "ヶ": 2587, "塹": 2588, "吉": 2589, "鍬": 2590, "飴": 2591, "士": 2592, "狐": 2593, "蹴": 2594, "券": 2595, "養": 2596, "惰": 2597, "ル": 2598, "好": 2599, "碁": 2600, "食": 2601, "縛": 2602, "集": 2603, "七": 2604, "運": 2605, "黒": 2606, "据": 2607, "舷": 2608, "寵": 2609, "卿": 2610, "ベ": 2611, "吹": 2612, "浮": 2613, "功": 2614, "鍋": 2615, "嫉": 2616, "坐": 2617, "青": 2618, "財": 2619, "馬": 2620, "条": 2621, "管": 2622, "仏": 2623, "塗": 2624, "都": 2625, "八": 2626, "没": 2627, "氾": 2628, "萎": 2629, "泉": 2630, "靴": 2631, "熔": 2632, "櫓": 2633, "松": 2634, "尿": 2635, "況": 2636, "敷": 2637, "泥": 2638, "盤": 2639, "玉": 2640, "梨": 2641, "剛": 2642, "麻": 
2643, "畿": 2644, "骸": 2645, "落": 2646, "周": 2647, "桑": 2648, "謄": 2649, "揮": 2650, "旦": 2651, "勧": 2652, "逐": 2653, "優": 2654, "粘": 2655, "度": 2656, "バ": 2657, "頂": 2658, "蟲": 2659, "議": 2660, "ぞ": 2661, "始": 2662, "触": 2663, "聴": 2664, "詫": 2665, "年": 2666, "柑": 2667, "憾": 2668, "祟": 2669, "輸": 2670, "陣": 2671, "児": 2672, "接": 2673, "畑": 2674, "属": 2675, "記": 2676, "隔": 2677, "伸": 2678, "剤": 2679, "産": 2680, "印": 2681, "テ": 2682, "昼": 2683, "烈": 2684, "套": 2685, "井": 2686, "肢": 2687, "筋": 2688, "酷": 2689, "遡": 2690, "覆": 2691, "白": 2692, "祖": 2693, "幣": 2694, "箱": 2695, "激": 2696, "|": 2141, "[UNK]": 2697, "[PAD]": 2698}