Vu Minh Chien
committed on
Commit
·
5fe9736
1
Parent(s):
3c6b4f9
Add model files
Browse files- Fine-Tune-Wav2Vec2-Large-XLSR-Japan.ipynb +521 -0
- README.md +113 -0
- config.json +76 -0
- preprocessor_config.json +8 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- training_args.bin +3 -0
- vocab.json +1 -0
Fine-Tune-Wav2Vec2-Large-XLSR-Japan.ipynb
ADDED
@@ -0,0 +1,521 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"language_info": {
|
4 |
+
"codemirror_mode": {
|
5 |
+
"name": "ipython",
|
6 |
+
"version": 3
|
7 |
+
},
|
8 |
+
"file_extension": ".py",
|
9 |
+
"mimetype": "text/x-python",
|
10 |
+
"name": "python",
|
11 |
+
"nbconvert_exporter": "python",
|
12 |
+
"pygments_lexer": "ipython3",
|
13 |
+
"version": 3
|
14 |
+
},
|
15 |
+
"orig_nbformat": 2
|
16 |
+
},
|
17 |
+
"nbformat": 4,
|
18 |
+
"nbformat_minor": 2,
|
19 |
+
"cells": [
|
20 |
+
{
|
21 |
+
"cell_type": "code",
|
22 |
+
"execution_count": null,
|
23 |
+
"metadata": {},
|
24 |
+
"outputs": [],
|
25 |
+
"source": [
|
26 |
+
"%%capture\n",
|
27 |
+
"!pip install datasets==1.4.1\n",
|
28 |
+
"!pip install transformers==4.4.0\n",
|
29 |
+
"!pip install torchaudio\n",
|
30 |
+
"!pip install librosa\n",
|
31 |
+
"!pip install jiwer\n",
|
32 |
+
"!pip install mecab-python3\n",
|
33 |
+
"!pip install unidic-lite\n",
|
34 |
+
"!pip isntall audiomentations"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "code",
|
39 |
+
"execution_count": null,
|
40 |
+
"metadata": {},
|
41 |
+
"outputs": [],
|
42 |
+
"source": [
|
43 |
+
"from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor\n",
|
44 |
+
"from datasets import load_dataset, load_metric, ClassLabel, Dataset\n",
|
45 |
+
"from audiomentations import Compose, AddGaussianNoise, Gain, PitchShift, TimeStretch, Shift\n",
|
46 |
+
"from torch.optim.lr_scheduler import LambdaLR\n",
|
47 |
+
"from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer\n",
|
48 |
+
"\n",
|
49 |
+
"import pandas as pd\n",
|
50 |
+
"import numpy as np\n",
|
51 |
+
"import soundfile as sf\n",
|
52 |
+
"import re\n",
|
53 |
+
"import json\n",
|
54 |
+
"import torchaudio\n",
|
55 |
+
"import librosa\n",
|
56 |
+
"import datasets\n",
|
57 |
+
"import MeCab\n",
|
58 |
+
"import pykakasi\n",
|
59 |
+
"import random\n",
|
60 |
+
"\n",
|
61 |
+
"import torch\n",
|
62 |
+
"from dataclasses import dataclass, field\n",
|
63 |
+
"from typing import Any, Dict, List, Optional, Union"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"source": [
|
68 |
+
"# Load dataset and prepare processor"
|
69 |
+
],
|
70 |
+
"cell_type": "markdown",
|
71 |
+
"metadata": {}
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": null,
|
76 |
+
"metadata": {},
|
77 |
+
"outputs": [],
|
78 |
+
"source": [
|
79 |
+
"# Load public dataset from University of Tokyo\n",
|
80 |
+
"!wget http://ss-takashi.sakura.ne.jp/corpus/jsut_ver1.1.zip\n",
|
81 |
+
"!unzip jsut_ver1.1.zip\n",
|
82 |
+
"\n",
|
83 |
+
"path = 'jsut_ver1.1/basic5000/'\n",
|
84 |
+
"df = pd.read_csv(path + 'transcript_utf8.txt', header = None, delimiter = \":\", names=[\"path\", \"sentence\"], index_col=False)\n",
|
85 |
+
"df[\"path\"] = df[\"path\"].map(lambda x: path + 'wav/' + x + \".wav\")\n",
|
86 |
+
"df.head()\n",
|
87 |
+
"\n",
|
88 |
+
"jsut_voice_train = Dataset.from_pandas(df)"
|
89 |
+
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"cell_type": "code",
|
93 |
+
"execution_count": null,
|
94 |
+
"metadata": {},
|
95 |
+
"outputs": [],
|
96 |
+
"source": [
|
97 |
+
"# Import training dataset\n",
|
98 |
+
"common_voice_train = load_dataset('common_voice', 'ja',split='train+validation')\n",
|
99 |
+
"common_voice_test = load_dataset('common_voice', 'ja', split='test')\n",
|
100 |
+
"\n",
|
101 |
+
"# Remove unwanted columns\n",
|
102 |
+
"common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
|
103 |
+
"common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
|
104 |
+
"\n",
|
105 |
+
"# Concat common voice and public dataset\n",
|
106 |
+
"common_voice_train = datasets.concatenate_datasets([jsut_voice_train, common_voice_train])"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "code",
|
111 |
+
"execution_count": null,
|
112 |
+
"metadata": {},
|
113 |
+
"outputs": [],
|
114 |
+
"source": [
|
115 |
+
"# Parser Japanese sentence. Ex: \"pythonが大好きです\" -> \"python が 大好き です EOS\"\n",
|
116 |
+
"wakati = MeCab.Tagger(\"-Owakati\")\n",
|
117 |
+
"\n",
|
118 |
+
"# Unwanted token\n",
|
119 |
+
"chars_to_ignore_regex = '[\\,\\、\\。\\.\\「\\」\\…\\?\\・]'\n",
|
120 |
+
"\n",
|
121 |
+
"def remove_special_characters(batch):\n",
|
122 |
+
" batch[\"sentence\"] = wakati.parse(batch[\"sentence\"]).strip()\n",
|
123 |
+
" batch[\"sentence\"] = re.sub(chars_to_ignore_regex,'', batch[\"sentence\"]).strip()\n",
|
124 |
+
" return batch\n",
|
125 |
+
"\n",
|
126 |
+
"common_voice_train = common_voice_train.map(remove_special_characters)\n",
|
127 |
+
"common_voice_test = common_voice_test.map(remove_special_characters)"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "code",
|
132 |
+
"execution_count": null,
|
133 |
+
"metadata": {},
|
134 |
+
"outputs": [],
|
135 |
+
"source": [
|
136 |
+
"# make vocab file\n",
|
137 |
+
"def extract_all_chars(batch):\n",
|
138 |
+
" all_text = \" \".join(batch[\"sentence\"])\n",
|
139 |
+
" vocab = list(set(all_text))\n",
|
140 |
+
" return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
|
141 |
+
"\n",
|
142 |
+
"# make vocab list and text\n",
|
143 |
+
"vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
|
144 |
+
"vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)\n",
|
145 |
+
"\n",
|
146 |
+
"# concate vocab from train and test set\n",
|
147 |
+
"vocab_list = list(set(vocab_train[\"vocab\"][0]) | set(vocab_test[\"vocab\"][0]))\n",
|
148 |
+
"vocab_dict = {v: k for k, v in enumerate(vocab_list)}\n",
|
149 |
+
"print(len(vocab_dict))\n",
|
150 |
+
"vocab_dict[\"|\"] = vocab_dict[\" \"]\n",
|
151 |
+
"del vocab_dict[\" \"]\n",
|
152 |
+
"\n",
|
153 |
+
"# create unk and pad token\n",
|
154 |
+
"vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
|
155 |
+
"vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
|
156 |
+
"\n",
|
157 |
+
"# save to json file\n",
|
158 |
+
"with open('vocab.json', 'w') as vocab_file:\n",
|
159 |
+
" json.dump(vocab_dict, vocab_file, indent=2, ensure_ascii=False)"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "code",
|
164 |
+
"execution_count": null,
|
165 |
+
"metadata": {},
|
166 |
+
"outputs": [],
|
167 |
+
"source": [
|
168 |
+
"save_dir = \"./output_models\"\n",
|
169 |
+
"# wrap tokenizer and feature extractor to processor\n",
|
170 |
+
"tokenizer = Wav2Vec2CTCTokenizer(\"./vocab_demo.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
|
171 |
+
"feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)\n",
|
172 |
+
"processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)\n",
|
173 |
+
"processor.save_pretrained(save_dir)"
|
174 |
+
]
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"source": [
|
178 |
+
"# Prepare train and test dataset "
|
179 |
+
],
|
180 |
+
"cell_type": "markdown",
|
181 |
+
"metadata": {}
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"cell_type": "code",
|
185 |
+
"execution_count": null,
|
186 |
+
"metadata": {},
|
187 |
+
"outputs": [],
|
188 |
+
"source": [
|
189 |
+
"# convert audio from 48kHz to 16kHz (standard sample rate of wave2vec model)\n",
|
190 |
+
"def speech_file_to_array_fn(batch):\n",
|
191 |
+
" speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n",
|
192 |
+
" batch[\"speech\"] = librosa.resample(np.asarray(speech_array[0].numpy()), 48_000, 16_000)\n",
|
193 |
+
" batch[\"sampling_rate\"] = 16_000\n",
|
194 |
+
" batch[\"target_text\"] = batch[\"sentence\"]\n",
|
195 |
+
" return batch\n",
|
196 |
+
"\n",
|
197 |
+
"common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names,num_proc=4)\n",
|
198 |
+
"common_voice_test = common_voice_test.map(speech_file_to_array_fn,remove_columns=common_voice_test.column_names, num_proc=4) "
|
199 |
+
]
|
200 |
+
},
|
201 |
+
{
|
202 |
+
"cell_type": "code",
|
203 |
+
"execution_count": null,
|
204 |
+
"metadata": {},
|
205 |
+
"outputs": [],
|
206 |
+
"source": [
|
207 |
+
"# do augment to enrich common voice dataset \n",
|
208 |
+
"augment = Compose([\n",
|
209 |
+
" AddGaussianNoise(min_amplitude=0.0001, max_amplitude=0.001, p=0.8),\n",
|
210 |
+
" PitchShift(min_semitones=-1, max_semitones=1, p=0.8),\n",
|
211 |
+
" Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8),\n",
|
212 |
+
" TimeStretch(min_rate=0.8, max_rate=1.25, p=0.8)\n",
|
213 |
+
"\n",
|
214 |
+
"])\n",
|
215 |
+
"\n",
|
216 |
+
"def augmented_speech(batch, augment):\n",
|
217 |
+
" samples = np.array(batch[\"speech\"])\n",
|
218 |
+
" batch[\"speech\"] = augment(samples=samples, sample_rate=16000)\n",
|
219 |
+
" batch[\"sampling_rate\"] = 16_000\n",
|
220 |
+
" batch[\"target_text\"] = batch[\"target_text\"]\n",
|
221 |
+
" return batch\n",
|
222 |
+
"\n",
|
223 |
+
"# augument 50% of trainset\n",
|
224 |
+
"common_voice_train_augmented = common_voice_train.train_test_split(test_size = 0.5)['train']\n",
|
225 |
+
"common_voice_train_augmented = common_voice_train_augmented.map(lambda batch: augmented_speech(batch, augment), num_proc=4)\n",
|
226 |
+
"\n",
|
227 |
+
"# concate with trainset\n",
|
228 |
+
"common_voice_train = datasets.concatenate_datasets([common_voice_train_augmented, common_voice_train])"
|
229 |
+
]
|
230 |
+
},
|
231 |
+
{
|
232 |
+
"cell_type": "code",
|
233 |
+
"execution_count": null,
|
234 |
+
"metadata": {},
|
235 |
+
"outputs": [],
|
236 |
+
"source": [
|
237 |
+
"def prepare_dataset(batch):\n",
|
238 |
+
" # check that all files have the correct sampling rate\n",
|
239 |
+
" assert (\n",
|
240 |
+
" len(set(batch[\"sampling_rate\"])) == 1\n",
|
241 |
+
" ), f\"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}.\"\n",
|
242 |
+
"\n",
|
243 |
+
" batch[\"input_values\"] = processor(batch[\"speech\"], sampling_rate=batch[\"sampling_rate\"][0]).input_values\n",
|
244 |
+
" \n",
|
245 |
+
" with processor.as_target_processor():\n",
|
246 |
+
" batch[\"labels\"] = processor(batch[\"target_text\"]).input_ids\n",
|
247 |
+
" return batch\n",
|
248 |
+
" \n",
|
249 |
+
"# prepare dataset\n",
|
250 |
+
"common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=8, num_proc=4, batched=True)\n",
|
251 |
+
"common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)"
|
252 |
+
]
|
253 |
+
},
|
254 |
+
{
|
255 |
+
"source": [
|
256 |
+
"# Training"
|
257 |
+
],
|
258 |
+
"cell_type": "markdown",
|
259 |
+
"metadata": {}
|
260 |
+
},
|
261 |
+
{
|
262 |
+
"cell_type": "code",
|
263 |
+
"execution_count": null,
|
264 |
+
"metadata": {},
|
265 |
+
"outputs": [],
|
266 |
+
"source": [
|
267 |
+
"# create data collator\n",
|
268 |
+
"@dataclass\n",
|
269 |
+
"class DataCollatorCTCWithPadding:\n",
|
270 |
+
"\n",
|
271 |
+
" processor: Wav2Vec2Processor\n",
|
272 |
+
" padding: Union[bool, str] = True\n",
|
273 |
+
" max_length: Optional[int] = None\n",
|
274 |
+
" max_length_labels: Optional[int] = None\n",
|
275 |
+
" pad_to_multiple_of: Optional[int] = None\n",
|
276 |
+
" pad_to_multiple_of_labels: Optional[int] = None\n",
|
277 |
+
"\n",
|
278 |
+
" def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
|
279 |
+
" input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n",
|
280 |
+
" label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
|
281 |
+
"\n",
|
282 |
+
" batch = self.processor.pad(\n",
|
283 |
+
" input_features,\n",
|
284 |
+
" padding=self.padding,\n",
|
285 |
+
" max_length=self.max_length,\n",
|
286 |
+
" pad_to_multiple_of=self.pad_to_multiple_of,\n",
|
287 |
+
" return_tensors=\"pt\",\n",
|
288 |
+
" )\n",
|
289 |
+
" with self.processor.as_target_processor():\n",
|
290 |
+
" labels_batch = self.processor.pad(\n",
|
291 |
+
" label_features,\n",
|
292 |
+
" padding=self.padding,\n",
|
293 |
+
" max_length=self.max_length_labels,\n",
|
294 |
+
" pad_to_multiple_of=self.pad_to_multiple_of_labels,\n",
|
295 |
+
" return_tensors=\"pt\",\n",
|
296 |
+
" )\n",
|
297 |
+
"\n",
|
298 |
+
" # replace padding with -100 to ignore loss correctly\n",
|
299 |
+
" labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
|
300 |
+
"\n",
|
301 |
+
" batch[\"labels\"] = labels\n",
|
302 |
+
"\n",
|
303 |
+
" return batch\n",
|
304 |
+
"\n",
|
305 |
+
"data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)"
|
306 |
+
]
|
307 |
+
},
|
308 |
+
{
|
309 |
+
"cell_type": "code",
|
310 |
+
"execution_count": null,
|
311 |
+
"metadata": {},
|
312 |
+
"outputs": [],
|
313 |
+
"source": [
|
314 |
+
"# make metric function\n",
|
315 |
+
"wer_metric = load_metric(\"wer\")\n",
|
316 |
+
"\n",
|
317 |
+
"def compute_metrics(pred):\n",
|
318 |
+
" pred_logits = pred.predictions\n",
|
319 |
+
" pred_ids = np.argmax(pred_logits, axis=-1)\n",
|
320 |
+
"\n",
|
321 |
+
" pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n",
|
322 |
+
"\n",
|
323 |
+
" pred_str = processor.batch_decode(pred_ids)\n",
|
324 |
+
" # we do not want to group tokens when computing the metrics\n",
|
325 |
+
" label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n",
|
326 |
+
"\n",
|
327 |
+
" wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
|
328 |
+
"\n",
|
329 |
+
" return {\"wer\": wer}"
|
330 |
+
]
|
331 |
+
},
|
332 |
+
{
|
333 |
+
"cell_type": "code",
|
334 |
+
"execution_count": null,
|
335 |
+
"metadata": {},
|
336 |
+
"outputs": [],
|
337 |
+
"source": [
|
338 |
+
"# create custom learning scheduler\n",
|
339 |
+
"\n",
|
340 |
+
"# polynomial decay\n",
|
341 |
+
"def get_polynomial_decay_schedule_with_warmup(\n",
|
342 |
+
" optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.2, last_epoch=-1\n",
|
343 |
+
"):\n",
|
344 |
+
"\n",
|
345 |
+
" lr_init = optimizer.defaults[\"lr\"]\n",
|
346 |
+
" assert lr_init > lr_end, f\"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})\"\n",
|
347 |
+
"\n",
|
348 |
+
" def lr_lambda(current_step: int):\n",
|
349 |
+
" if current_step < num_warmup_steps:\n",
|
350 |
+
" return float(current_step) / float(max(1, num_warmup_steps))\n",
|
351 |
+
" elif current_step > num_training_steps:\n",
|
352 |
+
" return lr_end / lr_init # as LambdaLR multiplies by lr_init\n",
|
353 |
+
" else:\n",
|
354 |
+
" lr_range = lr_init - lr_end\n",
|
355 |
+
" decay_steps = num_training_steps - num_warmup_steps\n",
|
356 |
+
" pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps\n",
|
357 |
+
" decay = lr_range * pct_remaining ** power + lr_end\n",
|
358 |
+
" return decay / lr_init # as LambdaLR multiplies by lr_init\n",
|
359 |
+
"\n",
|
360 |
+
" return LambdaLR(optimizer, lr_lambda, last_epoch)\n",
|
361 |
+
" \n",
|
362 |
+
"# wrap custom learning scheduler with trainer\n",
|
363 |
+
"class PolyTrainer(Trainer):\n",
|
364 |
+
" def __init__(self, *args, **kwargs):\n",
|
365 |
+
" super().__init__(*args, **kwargs)\n",
|
366 |
+
" \n",
|
367 |
+
" def create_scheduler(self, num_training_steps: int):\n",
|
368 |
+
" self.lr_scheduler = get_polynomial_decay_schedule_with_warmup(self.optimizer, \n",
|
369 |
+
" num_warmup_steps=self.args.warmup_steps,\n",
|
370 |
+
" num_training_steps=num_training_steps)\n",
|
371 |
+
" def create_optimizer_and_scheduler(self, num_training_steps: int):\n",
|
372 |
+
" self.create_optimizer()\n",
|
373 |
+
" self.create_scheduler(num_training_steps)"
|
374 |
+
]
|
375 |
+
},
|
376 |
+
{
|
377 |
+
"cell_type": "code",
|
378 |
+
"execution_count": null,
|
379 |
+
"metadata": {},
|
380 |
+
"outputs": [],
|
381 |
+
"source": [
|
382 |
+
"# load pretrain model\n",
|
383 |
+
"model = Wav2Vec2ForCTC.from_pretrained(\n",
|
384 |
+
" \"facebook/wav2vec2-large-xlsr-53\", \n",
|
385 |
+
" attention_dropout=0.1,\n",
|
386 |
+
" hidden_dropout=0.1,\n",
|
387 |
+
" feat_proj_dropout=0.1,\n",
|
388 |
+
" mask_time_prob=0.1, \n",
|
389 |
+
" layerdrop=0.1,\n",
|
390 |
+
" gradient_checkpointing=True, \n",
|
391 |
+
" ctc_loss_reduction=\"mean\", \n",
|
392 |
+
" pad_token_id=processor.tokenizer.pad_token_id,\n",
|
393 |
+
" vocab_size=len(processor.tokenizer)\n",
|
394 |
+
")\n",
|
395 |
+
"# free feature extractor\n",
|
396 |
+
"model.freeze_feature_extractor()\n",
|
397 |
+
"\n",
|
398 |
+
"# define train argument\n",
|
399 |
+
"training_args = TrainingArguments(\n",
|
400 |
+
" output_dir=save_dir,\n",
|
401 |
+
" group_by_length=True,\n",
|
402 |
+
" per_device_train_batch_size=32,\n",
|
403 |
+
" gradient_accumulation_steps=2,\n",
|
404 |
+
" evaluation_strategy=\"steps\",\n",
|
405 |
+
" num_train_epochs=200,\n",
|
406 |
+
" fp16=True,\n",
|
407 |
+
" save_steps=2400, \n",
|
408 |
+
" eval_steps=800,\n",
|
409 |
+
" logging_steps=800, \n",
|
410 |
+
" learning_rate=1e-4, \n",
|
411 |
+
" warmup_steps=1500, \n",
|
412 |
+
" save_total_limit=2,\n",
|
413 |
+
" load_best_model_at_end = True, \n",
|
414 |
+
" metric_for_best_model='wer', \n",
|
415 |
+
" greater_is_better=False\n",
|
416 |
+
")\n",
|
417 |
+
"\n",
|
418 |
+
"# wrap everything to Trainer\n",
|
419 |
+
"trainer = PolyTrainer(\n",
|
420 |
+
" model=model,\n",
|
421 |
+
" data_collator=data_collator,\n",
|
422 |
+
" args=training_args,\n",
|
423 |
+
" compute_metrics=compute_metrics,\n",
|
424 |
+
" train_dataset=common_voice_train,\n",
|
425 |
+
" eval_dataset=common_voice_test,\n",
|
426 |
+
" tokenizer=processor.feature_extractor,\n",
|
427 |
+
")"
|
428 |
+
]
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"cell_type": "code",
|
432 |
+
"execution_count": null,
|
433 |
+
"metadata": {},
|
434 |
+
"outputs": [],
|
435 |
+
"source": [
|
436 |
+
"# training\n",
|
437 |
+
"train_result = trainer.train()"
|
438 |
+
]
|
439 |
+
},
|
440 |
+
{
|
441 |
+
"source": [
|
442 |
+
"# Testing result"
|
443 |
+
],
|
444 |
+
"cell_type": "markdown",
|
445 |
+
"metadata": {}
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": null,
|
450 |
+
"metadata": {},
|
451 |
+
"outputs": [],
|
452 |
+
"source": [
|
453 |
+
"import torch\n",
|
454 |
+
"import torchaudio\n",
|
455 |
+
"from datasets import load_dataset, load_metric\n",
|
456 |
+
"from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
|
457 |
+
"import MeCab\n",
|
458 |
+
"import pykakasi\n",
|
459 |
+
"import re\n",
|
460 |
+
"\n",
|
461 |
+
"#config\n",
|
462 |
+
"wakati = MeCab.Tagger(\"-Owakati\")\n",
|
463 |
+
"chars_to_ignore_regex = '[\\,\\、\\。\\.\\「\\」\\…\\?\\・]'\n",
|
464 |
+
"\n",
|
465 |
+
"#load model\n",
|
466 |
+
"processor = Wav2Vec2Processor.from_pretrained(save_dir)\n",
|
467 |
+
"test_model = Wav2Vec2ForCTC.from_pretrained(save_dir)\n",
|
468 |
+
"test_model.to(\"cuda\")\n",
|
469 |
+
"resampler = torchaudio.transforms.Resample(48_000, 16_000)\n",
|
470 |
+
"\n",
|
471 |
+
"#load testdata\n",
|
472 |
+
"test_dataset = load_dataset(\"common_voice\", \"ja\", split=\"test\")\n",
|
473 |
+
"wer = load_metric(\"wer\")\n",
|
474 |
+
"\n",
|
475 |
+
"# Preprocessing the datasets.\n",
|
476 |
+
"def speech_file_to_array_fn(batch):\n",
|
477 |
+
" batch[\"sentence\"] = wakati.parse(batch[\"sentence\"]).strip()\n",
|
478 |
+
" batch[\"sentence\"] = re.sub(chars_to_ignore_regex,'', batch[\"sentence\"]).strip()\n",
|
479 |
+
" speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n",
|
480 |
+
" batch[\"speech\"] = resampler(speech_array).squeeze().numpy()\n",
|
481 |
+
" return batch\n",
|
482 |
+
"\n",
|
483 |
+
"test_dataset = test_dataset.map(speech_file_to_array_fn)\n",
|
484 |
+
"\n",
|
485 |
+
"# Preprocessing the datasets.\n",
|
486 |
+
"# We need to read the aduio files as arrays\n",
|
487 |
+
"def evaluate(batch):\n",
|
488 |
+
" inputs = processor(batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n",
|
489 |
+
"\n",
|
490 |
+
" with torch.no_grad():\n",
|
491 |
+
" logits = test_model(inputs.input_values.to(\"cuda\"), attention_mask=inputs.attention_mask.to(\"cuda\")).logits\n",
|
492 |
+
" pred_ids = torch.argmax(logits, dim=-1)\n",
|
493 |
+
" batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n",
|
494 |
+
" return batch\n",
|
495 |
+
"\n",
|
496 |
+
"result = test_dataset.map(evaluate, batched=True, batch_size=8)\n",
|
497 |
+
"\n",
|
498 |
+
"print(\"WER: {:2f}\".format(100 * wer.compute(predictions=result[\"pred_strings\"], references=result[\"sentence\"])))"
|
499 |
+
]
|
500 |
+
},
|
501 |
+
{
|
502 |
+
"cell_type": "code",
|
503 |
+
"execution_count": null,
|
504 |
+
"metadata": {},
|
505 |
+
"outputs": [],
|
506 |
+
"source": [
|
507 |
+
"# print some reusults\n",
|
508 |
+
"pick = random.randint(0, len(common_voice_test_transcription)-1)\n",
|
509 |
+
"input_dict = processor(common_voice_test[\"input_values\"][pick], return_tensors=\"pt\", padding=True)\n",
|
510 |
+
"logits = test_model(input_dict.input_values.to(\"cuda\")).logits\n",
|
511 |
+
"pred_ids = torch.argmax(logits, dim=-1)[0]\n",
|
512 |
+
"\n",
|
513 |
+
"print(\"Prediction:\")\n",
|
514 |
+
"print(processor.decode(pred_ids).strip())\n",
|
515 |
+
"\n",
|
516 |
+
"print(\"\\nLabel:\")\n",
|
517 |
+
"print(processor.decode(common_voice_test['labels'][pick]))\n"
|
518 |
+
]
|
519 |
+
}
|
520 |
+
]
|
521 |
+
}
|
README.md
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: ja
|
3 |
+
datasets:
|
4 |
+
- common_voice
|
5 |
+
metrics:
|
6 |
+
- wer
|
7 |
+
tags:
|
8 |
+
- audio
|
9 |
+
- automatic-speech-recognition
|
10 |
+
- speech
|
11 |
+
- xlsr-fine-tuning-week
|
12 |
+
license: apache-2.0
|
13 |
+
model-index:
|
14 |
+
- name: XLSR Wav2Vec2 Japanese by Chien Vu
|
15 |
+
results:
|
16 |
+
- task:
|
17 |
+
name: Speech Recognition
|
18 |
+
type: automatic-speech-recognition
|
19 |
+
dataset:
|
20 |
+
name: Common Voice Japanese
|
21 |
+
type: common_voice
|
22 |
+
args: ja
|
23 |
+
metrics:
|
24 |
+
- name: Test WER
|
25 |
+
type: wer
|
26 |
+
value: 46.77
|
27 |
+
---
|
28 |
+
# Wav2Vec2-Large-XLSR-53-Japanese
|
29 |
+
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Japanese using the [Common Voice](https://huggingface.co/datasets/common_voice) and Japanese speech corpus of Saruwatari-lab, University of Tokyo [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut).
|
30 |
+
When using this model, make sure that your speech input is sampled at 16kHz.
|
31 |
+
## Usage
|
32 |
+
The model can be used directly (without a language model) as follows:
|
33 |
+
```python
|
34 |
+
import torch
|
35 |
+
import torchaudio
|
36 |
+
import librosa
|
37 |
+
from datasets import load_dataset
|
38 |
+
import MeCab
|
39 |
+
import pykakasi
|
40 |
+
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
41 |
+
|
42 |
+
# config
|
43 |
+
wakati = MeCab.Tagger("-Owakati")
|
44 |
+
chars_to_ignore_regex = '[\,\、\。\.\「\」\…\?\・]'
|
45 |
+
|
46 |
+
# load data, processor and model
|
47 |
+
test_dataset = load_dataset("common_voice", "ja", split="test[:2%]")
|
48 |
+
processor = Wav2Vec2Processor.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
|
49 |
+
model = Wav2Vec2ForCTC.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
|
50 |
+
resampler = lambda sr, y: librosa.resample(y.numpy().squeeze(), sr, 16_000)
|
51 |
+
|
52 |
+
# Preprocessing the datasets.
|
53 |
+
def speech_file_to_array_fn(batch):
|
54 |
+
batch["sentence"] = wakati.parse(batch["sentence"]).strip()
|
55 |
+
batch["sentence"] = re.sub(chars_to_ignore_regex,'', batch["sentence"]).strip()
|
56 |
+
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
57 |
+
batch["speech"] = resampler(sampling_rate, speech_array).squeeze()
|
58 |
+
return batch
|
59 |
+
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
60 |
+
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
|
61 |
+
with torch.no_grad():
|
62 |
+
logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
|
63 |
+
predicted_ids = torch.argmax(logits, dim=-1)
|
64 |
+
print("Prediction:", processor.batch_decode(predicted_ids))
|
65 |
+
print("Reference:", test_dataset["sentence"][:2])
|
66 |
+
```
|
67 |
+
## Evaluation
|
68 |
+
The model can be evaluated as follows on the Japanese test data of Common Voice.
|
69 |
+
```python
|
70 |
+
import torch
|
71 |
+
import librosa
|
72 |
+
import torchaudio
|
73 |
+
from datasets import load_dataset
|
74 |
+
import MeCab
|
75 |
+
import pykakasi
|
76 |
+
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
77 |
+
|
78 |
+
#config
|
79 |
+
wakati = MeCab.Tagger("-Owakati")
|
80 |
+
chars_to_ignore_regex = '[\,\、\。\.\「\」\…\?\・]'
|
81 |
+
|
82 |
+
# load data, processor and model
|
83 |
+
test_dataset = load_dataset("common_voice", "ja", split="test")
|
84 |
+
wer = load_metric("wer")
|
85 |
+
processor = Wav2Vec2Processor.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
|
86 |
+
model = Wav2Vec2ForCTC.from_pretrained("vumichien/wav2vec2-large-xlsr-japanese")
|
87 |
+
model.to("cuda")
|
88 |
+
resampler = lambda sr, y: librosa.resample(y.numpy().squeeze(), sr, 16_000)
|
89 |
+
|
90 |
+
# Preprocessing the datasets.
|
91 |
+
def speech_file_to_array_fn(batch):
|
92 |
+
batch["sentence"] = kakasi.do(wakati.parse(batch["sentence"]).strip())
|
93 |
+
batch["sentence"] = re.sub(chars_to_ignore_regex,'', batch["sentence"]).strip()
|
94 |
+
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
95 |
+
batch["speech"] = resampler(sampling_rate, speech_array).squeeze()
|
96 |
+
return batch
|
97 |
+
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
98 |
+
|
99 |
+
# evaluate function
|
100 |
+
def evaluate(batch):
|
101 |
+
inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
|
102 |
+
with torch.no_grad():
|
103 |
+
logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
|
104 |
+
pred_ids = torch.argmax(logits, dim=-1)
|
105 |
+
batch["pred_strings"] = processor.batch_decode(pred_ids)
|
106 |
+
return batch
|
107 |
+
result = test_dataset.map(evaluate, batched=True, batch_size=8)
|
108 |
+
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
109 |
+
```
|
110 |
+
**Test Result**: 46.77
|
111 |
+
## Training
|
112 |
+
The Common Voice `train` and `validation` datasets and the `basic5000` subset of the JSUT Japanese speech corpus were used for training.
|
113 |
+
The script used for training can be found [here](Fine-Tune-Wav2Vec2-Large-XLSR-Japan.ipynb)
|
config.json
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "facebook/wav2vec2-large-xlsr-53",
|
3 |
+
"activation_dropout": 0.0,
|
4 |
+
"apply_spec_augment": true,
|
5 |
+
"architectures": [
|
6 |
+
"Wav2Vec2ForCTC"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"conv_bias": true,
|
11 |
+
"conv_dim": [
|
12 |
+
512,
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512
|
19 |
+
],
|
20 |
+
"conv_kernel": [
|
21 |
+
10,
|
22 |
+
3,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
2,
|
27 |
+
2
|
28 |
+
],
|
29 |
+
"conv_stride": [
|
30 |
+
5,
|
31 |
+
2,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2
|
37 |
+
],
|
38 |
+
"ctc_loss_reduction": "mean",
|
39 |
+
"ctc_zero_infinity": false,
|
40 |
+
"do_stable_layer_norm": true,
|
41 |
+
"eos_token_id": 2,
|
42 |
+
"feat_extract_activation": "gelu",
|
43 |
+
"feat_extract_dropout": 0.0,
|
44 |
+
"feat_extract_norm": "layer",
|
45 |
+
"feat_proj_dropout": 0.1,
|
46 |
+
"final_dropout": 0.0,
|
47 |
+
"gradient_checkpointing": true,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 1024,
|
51 |
+
"initializer_range": 0.02,
|
52 |
+
"intermediate_size": 4096,
|
53 |
+
"layer_norm_eps": 1e-05,
|
54 |
+
"layerdrop": 0.1,
|
55 |
+
"mask_channel_length": 10,
|
56 |
+
"mask_channel_min_space": 1,
|
57 |
+
"mask_channel_other": 0.0,
|
58 |
+
"mask_channel_prob": 0.0,
|
59 |
+
"mask_channel_selection": "static",
|
60 |
+
"mask_feature_length": 10,
|
61 |
+
"mask_feature_prob": 0.0,
|
62 |
+
"mask_time_length": 10,
|
63 |
+
"mask_time_min_space": 1,
|
64 |
+
"mask_time_other": 0.0,
|
65 |
+
"mask_time_prob": 0.1,
|
66 |
+
"mask_time_selection": "static",
|
67 |
+
"model_type": "wav2vec2",
|
68 |
+
"num_attention_heads": 16,
|
69 |
+
"num_conv_pos_embedding_groups": 16,
|
70 |
+
"num_conv_pos_embeddings": 128,
|
71 |
+
"num_feat_extract_layers": 7,
|
72 |
+
"num_hidden_layers": 24,
|
73 |
+
"pad_token_id": 2698,
|
74 |
+
"transformers_version": "4.5.0.dev0",
|
75 |
+
"vocab_size": 2699
|
76 |
+
}
|
preprocessor_config.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_size": 1,
|
4 |
+
"padding_side": "right",
|
5 |
+
"padding_value": 0.0,
|
6 |
+
"return_attention_mask": true,
|
7 |
+
"sampling_rate": 16000
|
8 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f20aa1dd1aea1e481a5e87034fe4733dabfb16d6c44096dfc55ecef8fd0777c
|
3 |
+
size 1272999703
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:540a9cb029ba4c351fdb2719b51eb95f3c1683ee51cb25588127285fcfe59420
|
3 |
+
size 2351
|
vocab.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"ソ": 0, "消": 1, "講": 2, "核": 3, "撓": 4, "圏": 5, "挽": 6, "鬱": 7, "瓢": 8, "証": 9, "煩": 10, "ヌ": 11, "童": 12, "少": 13, "疹": 14, "昭": 15, "躇": 16, "歴": 17, "淫": 18, "謡": 19, "葺": 20, "炒": 21, "神": 22, "軸": 23, "爬": 24, "談": 25, "趙": 26, "暗": 27, "鳴": 28, "弄": 29, "窯": 30, "御": 31, "全": 32, "雹": 33, "孺": 34, "班": 35, "抱": 36, "源": 37, "摯": 38, "雄": 39, "豚": 40, "焜": 41, "岩": 42, "法": 43, "泰": 44, "役": 45, "智": 46, "避": 47, "省": 48, "巾": 49, "家": 50, "癒": 51, "抵": 52, "想": 53, "環": 54, "襞": 55, "将": 56, "僅": 57, "宣": 58, "妨": 59, "醜": 60, "充": 61, "亡": 62, "犠": 63, "戯": 64, "首": 65, "や": 66, "剥": 67, "ど": 68, "茶": 69, "斗": 70, "艶": 71, "人": 72, "彗": 73, "填": 74, "絵": 75, "草": 76, "曖": 77, "謔": 78, "斧": 79, "サ": 80, "我": 81, "裏": 82, "働": 83, "濯": 84, "秋": 85, "殻": 86, "動": 87, "旧": 88, "像": 89, "謀": 90, "ワ": 91, "苗": 92, "塔": 93, "嘲": 94, "挨": 95, "猛": 96, "両": 97, "掃": 98, "蓄": 99, "ュ": 100, "啄": 101, "襦": 102, "憬": 103, "真": 104, "存": 105, "較": 106, "焚": 107, "偽": 108, "凌": 109, "蹙": 110, "尻": 111, "識": 112, "句": 113, "奥": 114, "崖": 115, "急": 116, "往": 117, "漑": 118, "詰": 119, "拶": 120, "勾": 121, "和": 122, "礁": 123, "謹": 124, "貴": 125, "膚": 126, "痼": 127, "装": 128, "梁": 129, "胃": 130, "髷": 131, "除": 132, "幽": 133, "薬": 134, "痒": 135, "列": 136, "庫": 137, "栃": 138, "頷": 139, "ぱ": 140, "僚": 141, "湯": 142, "死": 143, "亭": 144, "額": 145, "頒": 146, "婆": 147, "任": 148, "等": 149, "複": 150, "奮": 151, "訓": 152, "練": 153, "視": 154, "津": 155, "凶": 156, "方": 157, "性": 158, "格": 159, "覗": 160, "岳": 161, "妹": 162, "夏": 163, "祓": 164, "被": 165, "弱": 166, "闇": 167, "暑": 168, "隅": 169, "曾": 170, "畜": 171, "指": 172, "昔": 173, "橈": 174, "親": 175, "録": 176, "ー": 177, "刊": 178, "帰": 179, "律": 180, "遅": 181, "店": 182, "零": 183, "職": 184, "魅": 185, "巧": 186, "樽": 187, "立": 188, "裔": 189, "歩": 190, "析": 191, "俵": 192, "工": 193, "渠": 194, "支": 195, "勝": 196, "宝": 197, "ド": 198, "ゲ": 199, "聚": 200, "睡": 201, "壇": 202, "肥": 203, "天": 204, "廉": 205, "の": 206, "設": 207, "震": 208, "四": 209, "愚": 
210, "錆": 211, "爵": 212, "至": 213, "痢": 214, "炊": 215, "イ": 216, "卑": 217, "槌": 218, "交": 219, "介": 220, "禎": 221, "敗": 222, "旬": 223, "呂": 224, "郎": 225, "吐": 226, "蛭": 227, "俊": 228, "悲": 229, "囚": 230, "衆": 231, "唾": 232, "円": 233, "廓": 234, "ピ": 235, "恩": 236, "啓": 237, "選": 238, "精": 239, "掘": 240, "鶏": 241, "佳": 242, "陰": 243, "毒": 244, "俗": 245, "然": 246, "殖": 247, "呼": 248, "賈": 249, "ざ": 250, "寝": 251, "野": 252, "妥": 253, "詞": 254, "嗣": 255, "熏": 256, "替": 257, "孤": 258, "解": 259, "報": 260, "羨": 261, "す": 262, "縄": 263, "枷": 264, "陥": 265, "極": 266, "踏": 267, "縫": 268, "淋": 269, "忘": 270, "悩": 271, "知": 272, "ジ": 273, "思": 274, "猜": 275, "効": 276, "並": 277, "傷": 278, "が": 279, "唆": 280, "章": 281, "岬": 282, "測": 283, "堤": 284, "私": 285, "如": 286, "庭": 287, "寮": 288, "ザ": 289, "扇": 290, "質": 291, "ャ": 292, "線": 293, "喩": 294, "邑": 295, "区": 296, "登": 297, "賓": 298, "座": 299, "底": 300, "林": 301, "返": 302, "共": 303, "髴": 304, "奏": 305, "鎧": 306, "牢": 307, "硬": 308, "嬢": 309, "決": 310, "携": 311, "糖": 312, "樹": 313, "箇": 314, "漫": 315, "碑": 316, "楓": 317, "攻": 318, "絞": 319, "珍": 320, "呉": 321, "叛": 322, "上": 323, "割": 324, "菊": 325, "儒": 326, "近": 327, "履": 328, "駆": 329, "了": 330, "賦": 331, "檻": 332, "唇": 333, "男": 334, "ゃ": 335, "翠": 336, "墨": 337, "酎": 338, "溜": 339, "這": 340, "国": 341, "故": 342, "無": 343, "霜": 344, "枚": 345, "涼": 346, "詠": 347, "繁": 348, "苦": 349, "汎": 350, "薇": 351, "染": 352, "舟": 353, "閲": 354, "切": 355, "襟": 356, "茂": 357, "墜": 358, "略": 359, "売": 360, "袢": 361, "衿": 362, "衛": 363, "ダ": 364, "播": 365, "驚": 366, "酪": 367, "泣": 368, "密": 369, "畳": 370, "づ": 371, "礦": 372, "羊": 373, "嫁": 374, "彼": 375, "昧": 376, "回": 377, "邦": 378, "片": 379, "咽": 380, "豆": 381, "懐": 382, "あ": 383, "乏": 384, "油": 385, "振": 386, "塵": 387, "旋": 388, "需": 389, "読": 390, "合": 391, "俺": 392, "麦": 393, "女": 394, "れ": 395, "譜": 396, "包": 397, "ご": 398, "惹": 399, "キ": 400, "殉": 401, "帝": 402, "束": 403, "粛": 404, "脱": 405, "郡": 406, "寒": 407, "匿": 408, "糞": 409, "梅": 
410, "村": 411, "翁": 412, "び": 413, "橙": 414, "改": 415, "澤": 416, "灰": 417, "禰": 418, "卸": 419, "伝": 420, "晩": 421, "睨": 422, "息": 423, "千": 424, "老": 425, "通": 426, "馴": 427, "燐": 428, "裕": 429, "弓": 430, "案": 431, "誓": 432, "牛": 433, "腎": 434, "隣": 435, "爽": 436, "虐": 437, "矢": 438, "奪": 439, "収": 440, "ィ": 441, "脳": 442, "屋": 443, "織": 444, "要": 445, "車": 446, "侯": 447, "繰": 448, "番": 449, "應": 450, "体": 451, "醒": 452, "農": 453, "肖": 454, "喚": 455, "京": 456, "搬": 457, "積": 458, "術": 459, "詣": 460, "夫": 461, "タ": 462, "慨": 463, "伐": 464, "脈": 465, "馳": 466, "机": 467, "乗": 468, "斐": 469, "竿": 470, "限": 471, "史": 472, "葉": 473, "仁": 474, "起": 475, "灯": 476, "ァ": 477, "吟": 478, "買": 479, "演": 480, "葬": 481, "盾": 482, "潜": 483, "叉": 484, "語": 485, "霊": 486, "新": 487, "待": 488, "薫": 489, "慣": 490, "追": 491, "漂": 492, "憤": 493, "桐": 494, "梗": 495, "俳": 496, "越": 497, "埼": 498, "疸": 499, "ぁ": 500, "責": 501, "培": 502, "鉱": 503, "曲": 504, "菩": 505, "貨": 506, "施": 507, "古": 508, "距": 509, "抄": 510, "雨": 511, "徒": 512, "導": 513, "推": 514, "冷": 515, "結": 516, "泌": 517, "送": 518, "本": 519, "顧": 520, "皮": 521, "枳": 522, "耐": 523, "に": 524, "靖": 525, "血": 526, "太": 527, "汗": 528, "貫": 529, "只": 530, "関": 531, "暁": 532, "醸": 533, "因": 534, "拐": 535, "玲": 536, "勲": 537, "じ": 538, "雲": 539, "晒": 540, "脊": 541, "給": 542, "機": 543, "洲": 544, "哀": 545, "弊": 546, "綴": 547, "輪": 548, "罹": 549, "徴": 550, "項": 551, "へ": 552, "牧": 553, "細": 554, "璃": 555, "拝": 556, "函": 557, "桟": 558, "韻": 559, "同": 560, "納": 561, "河": 562, "軍": 563, "税": 564, "瀬": 565, "犬": 566, "薄": 567, "鉛": 568, "糸": 569, "邪": 570, "恨": 571, "墾": 572, "併": 573, "期": 574, "循": 575, "寺": 576, "掲": 577, "穀": 578, "春": 579, "朴": 580, "倒": 581, "授": 582, "軒": 583, "狗": 584, "盆": 585, "腹": 586, "匠": 587, "牙": 588, "妻": 589, "言": 590, "紐": 591, "隠": 592, "壮": 593, "緒": 594, "濡": 595, "技": 596, "ま": 597, "エ": 598, "誠": 599, "率": 600, "敢": 601, "ぴ": 602, "凧": 603, "眉": 604, "丼": 605, "経": 606, "習": 607, "統": 608, "畝": 609, "短": 
610, "狡": 611, "晴": 612, "穂": 613, "刻": 614, "姻": 615, "特": 616, "唐": 617, "る": 618, "俸": 619, "頃": 620, "数": 621, "愉": 622, "拡": 623, "残": 624, "類": 625, "光": 626, "鋼": 627, "傭": 628, "磁": 629, "尽": 630, "賞": 631, "吠": 632, "附": 633, "窓": 634, "浜": 635, "壌": 636, "岐": 637, "ぬ": 638, "宴": 639, "鋭": 640, "該": 641, "債": 642, "瞬": 643, "ね": 644, "胥": 645, "浴": 646, "戻": 647, "恭": 648, "熱": 649, "齢": 650, "築": 651, "簿": 652, "貸": 653, "ゥ": 654, "泊": 655, "採": 656, "韓": 657, "盃": 658, "右": 659, "門": 660, "跪": 661, "慶": 662, "肛": 663, "牽": 664, "刑": 665, "応": 666, "協": 667, "窟": 668, "雪": 669, "部": 670, "麗": 671, "険": 672, "電": 673, "但": 674, "逮": 675, "席": 676, "褐": 677, "矯": 678, "慮": 679, "市": 680, "論": 681, "を": 682, "騎": 683, "増": 684, "忌": 685, "投": 686, "朕": 687, "剖": 688, "幸": 689, "作": 690, "巳": 691, "資": 692, "賠": 693, "緻": 694, "路": 695, "召": 696, "疇": 697, "排": 698, "狂": 699, "二": 700, "手": 701, "郊": 702, "山": 703, "母": 704, "概": 705, "著": 706, "ロ": 707, "琥": 708, "慎": 709, "尊": 710, "癖": 711, "刈": 712, "強": 713, "孔": 714, "斬": 715, "雌": 716, "崇": 717, "兄": 718, "沖": 719, "荼": 720, "注": 721, "捕": 722, "肋": 723, "化": 724, "途": 725, "最": 726, "兼": 727, "参": 728, "域": 729, "入": 730, "抹": 731, "宰": 732, "臍": 733, "遣": 734, "悦": 735, "ぺ": 736, "品": 737, "世": 738, "讐": 739, "鉄": 740, "瑞": 741, "珂": 742, "興": 743, "凄": 744, "延": 745, "羽": 746, "諦": 747, "疾": 748, "尺": 749, "甚": 750, "討": 751, "竣": 752, "打": 753, "捨": 754, "枕": 755, "量": 756, "務": 757, "叶": 758, "厳": 759, "冗": 760, "煮": 761, "歳": 762, "請": 763, "校": 764, "其": 765, "拉": 766, "唱": 767, "ラ": 768, "靄": 769, "騰": 770, "研": 771, "負": 772, "張": 773, "疎": 774, "砂": 775, "喫": 776, "澄": 777, "滞": 778, "理": 779, "箋": 780, "義": 781, "威": 782, "咎": 783, "湖": 784, "十": 785, "謝": 786, "枢": 787, "紳": 788, "称": 789, "矛": 790, "紺": 791, "悸": 792, "為": 793, "舗": 794, "冶": 795, "益": 796, "藩": 797, "園": 798, "伯": 799, "貿": 800, "備": 801, "物": 802, "藍": 803, "艘": 804, "覧": 805, "載": 806, "恵": 807, "皿": 808, "構": 809, "剽": 
810, "膣": 811, "慰": 812, "中": 813, "随": 814, "劫": 815, "さ": 816, "監": 817, "那": 818, "茄": 819, "叔": 820, "腰": 821, "坪": 822, "小": 823, "妙": 824, "半": 825, "桃": 826, "弯": 827, "恐": 828, "診": 829, "壊": 830, "侶": 831, "璽": 832, "終": 833, "荷": 834, "拳": 835, "殿": 836, "祈": 837, "爆": 838, "色": 839, "ち": 840, "腔": 841, "沙": 842, "芝": 843, "貰": 844, "詐": 845, "綿": 846, "絹": 847, "庁": 848, "萄": 849, "敏": 850, "困": 851, "腫": 852, "稼": 853, "転": 854, "供": 855, "噛": 856, "楊": 857, "ム": 858, "示": 859, "瑚": 860, "���": 861, "照": 862, "獲": 863, "怠": 864, "糧": 865, "破": 866, "企": 867, "懇": 868, "汚": 869, "釈": 870, "渋": 871, "制": 872, "裾": 873, "耕": 874, "祥": 875, "封": 876, "丙": 877, "埋": 878, "蹂": 879, "痩": 880, "嘴": 881, "撲": 882, "戦": 883, "卓": 884, "訟": 885, "跨": 886, "感": 887, "漆": 888, "静": 889, "比": 890, "襲": 891, "籍": 892, "景": 893, "鼠": 894, "忍": 895, "漢": 896, "赤": 897, "嗅": 898, "陽": 899, "学": 900, "繭": 901, "砕": 902, "端": 903, "描": 904, "筆": 905, "銅": 906, "眺": 907, "償": 908, "叩": 909, "階": 910, "顕": 911, "垂": 912, "別": 913, "界": 914, "斯": 915, "喜": 916, "呈": 917, "瘍": 918, "塑": 919, "芯": 920, "戴": 921, "占": 922, "ツ": 923, "側": 924, "捗": 925, "覇": 926, "鱠": 927, "現": 928, "恣": 929, "帽": 930, "派": 931, "媒": 932, "億": 933, "貝": 934, "峡": 935, "遠": 936, "丸": 937, "明": 938, "渡": 939, "更": 940, "便": 941, "湾": 942, "狭": 943, "尼": 944, "オ": 945, "侮": 946, "父": 947, "肪": 948, "闕": 949, "院": 950, "髪": 951, "元": 952, "錮": 953, "前": 954, "良": 955, "筑": 956, "コ": 957, "寓": 958, "礼": 959, "鞘": 960, "臭": 961, "柔": 962, "倹": 963, "腕": 964, "艇": 965, "嘱": 966, "だ": 967, "銀": 968, "串": 969, "プ": 970, "賛": 971, "涙": 972, "佐": 973, "穴": 974, "禁": 975, "酒": 976, "稚": 977, "靜": 978, "倣": 979, "妊": 980, "袋": 981, "宮": 982, "ゼ": 983, "購": 984, "大": 985, "パ": 986, "栄": 987, "瓶": 988, "辰": 989, "賑": 990, "堕": 991, "ハ": 992, "折": 993, "則": 994, "駅": 995, "速": 996, "形": 997, "弘": 998, "宿": 999, "え": 1000, "申": 1001, "点": 1002, "慢": 1003, "飽": 1004, "液": 1005, "操": 1006, "献": 1007, "裸": 1008, 
"痛": 1009, "鳥": 1010, "綻": 1011, "錠": 1012, "純": 1013, "寡": 1014, "唄": 1015, "隆": 1016, "峰": 1017, "欲": 1018, "怨": 1019, "長": 1020, "鎖": 1021, "怪": 1022, "た": 1023, "嫌": 1024, "昆": 1025, "緯": 1026, "撮": 1027, "費": 1028, "香": 1029, "意": 1030, "永": 1031, "坂": 1032, "看": 1033, "台": 1034, "綺": 1035, "廃": 1036, "な": 1037, "杏": 1038, "覚": 1039, "眩": 1040, "東": 1041, "願": 1042, "濃": 1043, "芳": 1044, "造": 1045, "九": 1046, "橋": 1047, "伺": 1048, "楷": 1049, "誘": 1050, "頓": 1051, "娠": 1052, "子": 1053, "祝": 1054, "ヘ": 1055, "絡": 1056, "縦": 1057, "版": 1058, "霧": 1059, "渇": 1060, "型": 1061, "曇": 1062, "ほ": 1063, "級": 1064, "潔": 1065, "固": 1066, "肩": 1067, "椅": 1068, "深": 1069, "ユ": 1070, "把": 1071, "生": 1072, "恥": 1073, "塩": 1074, "賀": 1075, "否": 1076, "っ": 1077, "メ": 1078, "儲": 1079, "頼": 1080, "算": 1081, "う": 1082, "会": 1083, "乞": 1084, "徳": 1085, "冒": 1086, "盟": 1087, "猶": 1088, "駄": 1089, "気": 1090, "ぼ": 1091, "妓": 1092, "味": 1093, "暦": 1094, "鍔": 1095, "悪": 1096, "棺": 1097, "掻": 1098, "紡": 1099, "箸": 1100, "行": 1101, "偵": 1102, "仰": 1103, "容": 1104, "暫": 1105, "懸": 1106, "猿": 1107, "係": 1108, "紅": 1109, "伏": 1110, "粉": 1111, "蛮": 1112, "煙": 1113, "某": 1114, "膳": 1115, "劇": 1116, "族": 1117, "墟": 1118, "心": 1119, "苑": 1120, "災": 1121, "猫": 1122, "糊": 1123, "姜": 1124, "凸": 1125, "遜": 1126, "ズ": 1127, "歓": 1128, "城": 1129, "擁": 1130, "酬": 1131, "累": 1132, "境": 1133, "遺": 1134, "射": 1135, "月": 1136, "稽": 1137, "尖": 1138, "宙": 1139, "臣": 1140, "暇": 1141, "停": 1142, "繋": 1143, "苛": 1144, "刹": 1145, "犯": 1146, "旗": 1147, "典": 1148, "狩": 1149, "踪": 1150, "茜": 1151, "祷": 1152, "漸": 1153, "菱": 1154, "岡": 1155, "姿": 1156, "疑": 1157, "勤": 1158, "薪": 1159, "寂": 1160, "政": 1161, "英": 1162, "闘": 1163, "賂": 1164, "鉤": 1165, "態": 1166, "腺": 1167, "鰭": 1168, "乙": 1169, "僕": 1170, "謎": 1171, "捜": 1172, "含": 1173, "顰": 1174, "篤": 1175, "鑑": 1176, "頻": 1177, "与": 1178, "変": 1179, "凍": 1180, "躊": 1181, "圭": 1182, "ブ": 1183, "躍": 1184, "ぶ": 1185, "浦": 1186, "編": 1187, "瞭": 1188, "爛": 1189, "催": 
1190, "砲": 1191, "径": 1192, "曰": 1193, "袖": 1194, "蔽": 1195, "潰": 1196, "音": 1197, "臨": 1198, "ネ": 1199, "彩": 1200, "撤": 1201, "妄": 1202, "競": 1203, "陸": 1204, "独": 1205, "ボ": 1206, "脂": 1207, "陳": 1208, "護": 1209, "段": 1210, "虞": 1211, "搭": 1212, "志": 1213, "槍": 1214, "具": 1215, "逆": 1216, "轄": 1217, "葡": 1218, "も": 1219, "傘": 1220, "契": 1221, "傲": 1222, "説": 1223, "喉": 1224, "依": 1225, "凡": 1226, "副": 1227, "鍵": 1228, "陛": 1229, "峻": 1230, "蒙": 1231, "健": 1232, "去": 1233, "辛": 1234, "彙": 1235, "身": 1236, "髄": 1237, "舌": 1238, "位": 1239, "濁": 1240, "索": 1241, "辿": 1242, "件": 1243, "浸": 1244, "紀": 1245, "早": 1246, "聡": 1247, "汝": 1248, "羅": 1249, "ノ": 1250, "で": 1251, "賭": 1252, "勅": 1253, "塚": 1254, "球": 1255, "横": 1256, "ス": 1257, "邸": 1258, "セ": 1259, "模": 1260, "是": 1261, "署": 1262, "甲": 1263, "ウ": 1264, "遮": 1265, "才": 1266, "錬": 1267, "泡": 1268, "足": 1269, "督": 1270, "く": 1271, "聞": 1272, "陪": 1273, "医": 1274, "卒": 1275, "創": 1276, "先": 1277, "扈": 1278, "州": 1279, "披": 1280, "鼓": 1281, "在": 1282, "寛": 1283, "鱗": 1284, "麒": 1285, "麺": 1286, "炭": 1287, "玄": 1288, "幅": 1289, "救": 1290, "差": 1291, "肝": 1292, "弁": 1293, "鳶": 1294, "吊": 1295, "餐": 1296, "活": 1297, "師": 1298, "風": 1299, "閥": 1300, "溝": 1301, "互": 1302, "成": 1303, "嗜": 1304, "着": 1305, "洗": 1306, "双": 1307, "詮": 1308, "贅": 1309, "滴": 1310, "堅": 1311, "刷": 1312, "航": 1313, "屍": 1314, "翼": 1315, "滋": 1316, "室": 1317, "助": 1318, "朋": 1319, "廷": 1320, "探": 1321, "借": 1322, "峠": 1323, "蜜": 1324, "漏": 1325, "正": 1326, "放": 1327, "巨": 1328, "薔": 1329, "領": 1330, "潮": 1331, "到": 1332, "様": 1333, "患": 1334, "信": 1335, "罅": 1336, "疫": 1337, "用": 1338, "浪": 1339, "畔": 1340, "蓋": 1341, "坦": 1342, "嶋": 1343, "伎": 1344, "蛛": 1345, "建": 1346, "勘": 1347, "腱": 1348, "雰": 1349, "租": 1350, "考": 1351, "褒": 1352, "代": 1353, "杖": 1354, "腸": 1355, "嘩": 1356, "黄": 1357, "緩": 1358, "宇": 1359, "適": 1360, "砦": 1361, "哺": 1362, "宜": 1363, "迎": 1364, "鍛": 1365, "婦": 1366, "べ": 1367, "惜": 1368, "乾": 1369, "憐": 1370, "且": 1371, 
"来": 1372, "氏": 1373, "よ": 1374, "援": 1375, "と": 1376, "阻": 1377, "墳": 1378, "廊": 1379, "緊": 1380, "淡": 1381, "雇": 1382, "欄": 1383, "週": 1384, "鶴": 1385, "龍": 1386, "広": 1387, "呆": 1388, "桁": 1389, "文": 1390, "炸": 1391, "誕": 1392, "祭": 1393, "趣": 1394, "飯": 1395, "堵": 1396, "裂": 1397, "鎌": 1398, "受": 1399, "飲": 1400, "鳩": 1401, "帆": 1402, "未": 1403, "ケ": 1404, "箪": 1405, "お": 1406, "岸": 1407, "嘘": 1408, "傾": 1409, "遷": 1410, "薦": 1411, "百": 1412, "焼": 1413, "伍": 1414, "氷": 1415, "堺": 1416, "叫": 1417, "棄": 1418, "障": 1419, "武": 1420, "ぽ": 1421, "安": 1422, "厄": 1423, "員": 1424, "垣": 1425, "騒": 1426, "丘": 1427, "郵": 1428, "誇": 1429, "ポ": 1430, "聊": 1431, "狙": 1432, "葛": 1433, "拷": 1434, "維": 1435, "旅": 1436, "楕": 1437, "釜": 1438, "酸": 1439, "痴": 1440, "間": 1441, "フ": 1442, "群": 1443, "波": 1444, "滲": 1445, "縮": 1446, "摩": 1447, "顔": 1448, "カ": 1449, "鬘": 1450, "藻": 1451, "仲": 1452, "促": 1453, "ぐ": 1454, "癇": 1455, "姉": 1456, "堪": 1457, "飛": 1458, "ゅ": 1459, "凝": 1460, "祉": 1461, "訃": 1462, "久": 1463, "巻": 1464, "礎": 1465, "官": 1466, "牲": 1467, "艦": 1468, "鈴": 1469, "客": 1470, "究": 1471, "績": 1472, "謁": 1473, "勇": 1474, "盗": 1475, "届": 1476, "熊": 1477, "致": 1478, "ペ": 1479, "商": 1480, "又": 1481, "朽": 1482, "阪": 1483, "喰": 1484, "琴": 1485, "旨": 1486, "値": 1487, "晶": 1488, "臓": 1489, "夕": 1490, "組": 1491, "他": 1492, "住": 1493, "暴": 1494, "融": 1495, "相": 1496, "害": 1497, "餌": 1498, "栓": 1499, "針": 1500, "ん": 1501, "遵": 1502, "森": 1503, "渦": 1504, "慈": 1505, "ェ": 1506, "払": 1507, "鵬": 1508, "閉": 1509, "乱": 1510, "離": 1511, "満": 1512, "勢": 1513, "拙": 1514, "幾": 1515, "戊": 1516, "崩": 1517, "弧": 1518, "板": 1519, "媛": 1520, "胡": 1521, "ヨ": 1522, "高": 1523, "ず": 1524, "憚": 1525, "床": 1526, "享": 1527, "躯": 1528, "涯": 1529, "丈": 1530, "閣": 1531, "庸": 1532, "荒": 1533, "畏": 1534, "噌": 1535, "博": 1536, "薯": 1537, "堂": 1538, "槽": 1539, "曽": 1540, "鯨": 1541, "劣": 1542, "鎮": 1543, "浄": 1544, "紙": 1545, "汽": 1546, "不": 1547, "調": 1548, "道": 1549, "販": 1550, "吏": 1551, "還": 1552, "准": 
1553, "符": 1554, "鰻": 1555, "偉": 1556, "拭": 1557, "笠": 1558, "何": 1559, "ン": 1560, "賢": 1561, "善": 1562, "快": 1563, "祠": 1564, "ナ": 1565, "き": 1566, "書": 1567, "諺": 1568, "盛": 1569, "塁": 1570, "達": 1571, "尾": 1572, "薩": 1573, "問": 1574, "情": 1575, "罠": 1576, "誌": 1577, "般": 1578, "内": 1579, "て": 1580, "鷹": 1581, "毎": 1582, "棒": 1583, "栗": 1584, "及": 1585, "幌": 1586, "誤": 1587, "公": 1588, "橘": 1589, "餓": 1590, "漿": 1591, "欣": 1592, "愛": 1593, "目": 1594, "巣": 1595, "象": 1596, "奇": 1597, "ゆ": 1598, "取": 1599, "述": 1600, "郭": 1601, "厭": 1602, "療": 1603, "粗": 1604, "米": 1605, "罰": 1606, "謙": 1607, "詩": 1608, "グ": 1609, "就": 1610, "透": 1611, "ョ": 1612, "浅": 1613, "各": 1614, "罵": 1615, "検": 1616, "可": 1617, "ぷ": 1618, "つ": 1619, "奈": 1620, "福": 1621, "忠": 1622, "挟": 1623, "ォ": 1624, "リ": 1625, "征": 1626, "彫": 1627, "せ": 1628, "党": 1629, "刺": 1630, "拒": 1631, "巡": 1632, "頑": 1633, "己": 1634, "北": 1635, "外": 1636, "扁": 1637, "船": 1638, "掌": 1639, "胎": 1640, "燗": 1641, "株": 1642, "婿": 1643, "仙": 1644, "表": 1645, "華": 1646, "遂": 1647, "麟": 1648, "挫": 1649, "瑠": 1650, "喪": 1651, "夷": 1652, "絶": 1653, "直": 1654, "難": 1655, "蔑": 1656, "逓": 1657, "堆": 1658, "斑": 1659, "孫": 1660, "観": 1661, "海": 1662, "鐘": 1663, "乃": 1664, "再": 1665, "平": 1666, "戸": 1667, "嵐": 1668, "系": 1669, "渓": 1670, "評": 1671, "絆": 1672, "殺": 1673, "鞄": 1674, "布": 1675, "虚": 1676, "諧": 1677, "絨": 1678, "拘": 1679, "斉": 1680, "剣": 1681, "力": 1682, "淵": 1683, "尉": 1684, "奴": 1685, "労": 1686, "こ": 1687, "紫": 1688, "戚": 1689, "午": 1690, "温": 1691, "篭": 1692, "筒": 1693, "閑": 1694, "衡": 1695, "饅": 1696, "口": 1697, "ト": 1698, "酢": 1699, "劾": 1700, "植": 1701, "準": 1702, "轟": 1703, "衷": 1704, "硫": 1705, "諮": 1706, "慕": 1707, "湧": 1708, "嘔": 1709, "県": 1710, "斤": 1711, "程": 1712, "街": 1713, "睦": 1714, "計": 1715, "利": 1716, "徹": 1717, "地": 1718, "拠": 1719, "仕": 1720, "試": 1721, "蓮": 1722, "擬": 1723, "呪": 1724, "治": 1725, "魚": 1726, "輩": 1727, "失": 1728, "虹": 1729, "星": 1730, "翻": 1731, "げ": 1732, "笹": 1733, "ア": 1734, 
"は": 1735, "喧": 1736, "譲": 1737, "刃": 1738, "訂": 1739, "ょ": 1740, "朗": 1741, "流": 1742, "猟": 1743, "み": 1744, "棋": 1745, "努": 1746, "兆": 1747, "雷": 1748, "匂": 1749, "胴": 1750, "映": 1751, "抒": 1752, "三": 1753, "普": 1754, "疵": 1755, "措": 1756, "芽": 1757, "異": 1758, "査": 1759, "颯": 1760, "憎": 1761, "皆": 1762, "于": 1763, "配": 1764, "順": 1765, "惨": 1766, "棟": 1767, "缶": 1768, "鼻": 1769, "摘": 1770, "預": 1771, "扱": 1772, "桜": 1773, "夢": 1774, "堀": 1775, "笑": 1776, "候": 1777, "担": 1778, "屠": 1779, "憲": 1780, "怒": 1781, "清": 1782, "鞭": 1783, "漁": 1784, "原": 1785, "宅": 1786, "状": 1787, "守": 1788, "拍": 1789, "察": 1790, "わ": 1791, "悟": 1792, "蔭": 1793, "赦": 1794, "持": 1795, "赴": 1796, "燥": 1797, "瞳": 1798, "ク": 1799, "沃": 1800, "分": 1801, "汰": 1802, "者": 1803, "火": 1804, "逃": 1805, "置": 1806, "暖": 1807, "溶": 1808, "欧": 1809, "紹": 1810, "背": 1811, "陀": 1812, "秘": 1813, "下": 1814, "票": 1815, "悔": 1816, "事": 1817, "尋": 1818, "飢": 1819, "昨": 1820, "民": 1821, "干": 1822, "侵": 1823, "有": 1824, "痘": 1825, "引": 1826, "脹": 1827, "沿": 1828, "承": 1829, "枯": 1830, "斜": 1831, "警": 1832, "兵": 1833, "出": 1834, "濫": 1835, "欠": 1836, "認": 1837, "押": 1838, "専": 1839, "留": 1840, "屯": 1841, "節": 1842, "飾": 1843, "蝶": 1844, "銘": 1845, "獄": 1846, "宛": 1847, "扉": 1848, "壱": 1849, "騙": 1850, "済": 1851, "熟": 1852, "懲": 1853, "緑": 1854, "課": 1855, "酵": 1856, "躙": 1857, "墓": 1858, "玩": 1859, "蛇": 1860, "奉": 1861, "噴": 1862, "庶": 1863, "遇": 1864, "洋": 1865, "日": 1866, "架": 1867, "哨": 1868, "序": 1869, "汁": 1870, "倉": 1871, "勉": 1872, "ぉ": 1873, "ヒ": 1874, "暮": 1875, "藤": 1876, "拾": 1877, "控": 1878, "鮫": 1879, "策": 1880, "械": 1881, "穏": 1882, "蝦": 1883, "微": 1884, "瓜": 1885, "聖": 1886, "洞": 1887, "胆": 1888, "暢": 1889, "菌": 1890, "牌": 1891, "左": 1892, "司": 1893, "免": 1894, "讃": 1895, "貯": 1896, "権": 1897, "跳": 1898, "握": 1899, "苔": 1900, "宦": 1901, "違": 1902, "滅": 1903, "多": 1904, "哲": 1905, "姪": 1906, "蚕": 1907, "ゾ": 1908, "貞": 1909, "竜": 1910, "塾": 1911, "名": 1912, "告": 1913, "総": 1914, "曜": 1915, "窃": 
1916, "詔": 1917, "ろ": 1918, "仮": 1919, "飄": 1920, "顎": 1921, "谷": 1922, "里": 1923, "饒": 1924, "壕": 1925, "従": 1926, "け": 1927, "実": 1928, "痕": 1929, "諸": 1930, "陵": 1931, "沢": 1932, "羞": 1933, "斎": 1934, "由": 1935, "鴉": 1936, "杯": 1937, "兜": 1938, "愁": 1939, "館": 1940, "憑": 1941, "層": 1942, "雑": 1943, "業": 1944, "貼": 1945, "重": 1946, "営": 1947, "迭": 1948, "荘": 1949, "主": 1950, "喝": 1951, "奨": 1952, "ホ": 1953, "帯": 1954, "令": 1955, "冥": 1956, "繍": 1957, "水": 1958, "話": 1959, "ら": 1960, "樫": 1961, "金": 1962, "傍": 1963, "枠": 1964, "棚": 1965, "必": 1966, "硝": 1967, "虫": 1968, "虜": 1969, "珊": 1970, "命": 1971, "止": 1972, "ビ": 1973, "加": 1974, "錨": 1975, "礬": 1976, "鱒": 1977, "昇": 1978, "肺": 1979, "辱": 1980, "募": 1981, "祐": 1982, "南": 1983, "求": 1984, "旺": 1985, "一": 1986, "角": 1987, "り": 1988, "掛": 1989, "寅": 1990, "均": 1991, "攀": 1992, "芸": 1993, "紋": 1994, "厠": 1995, "六": 1996, "衝": 1997, "几": 1998, "当": 1999, "寸": 2000, "超": 2001, "炉": 2002, "断": 2003, "燃": 2004, "夜": 2005, "似": 2006, "毬": 2007, "頭": 2008, "念": 2009, "皇": 2010, "材": 2011, "惑": 2012, "そ": 2013, "房": 2014, "シ": 2015, "朝": 2016, "叱": 2017, "舞": 2018, "締": 2019, "約": 2020, "黙": 2021, "漠": 2022, "董": 2023, "希": 2024, "酌": 2025, "禿": 2026, "沸": 2027, "雅": 2028, "ヤ": 2029, "鋳": 2030, "製": 2031, "軟": 2032, "進": 2033, "茅": 2034, "窩": 2035, "挙": 2036, "輝": 2037, "舎": 2038, "発": 2039, "肴": 2040, "臼": 2041, "叙": 2042, "婚": 2043, "洩": 2044, "鷲": 2045, "康": 2046, "唸": 2047, "基": 2048, "眈": 2049, "枡": 2050, "掴": 2051, "潟": 2052, "保": 2053, "蜂": 2054, "鬼": 2055, "瓦": 2056, "万": 2057, "諏": 2058, "腐": 2059, "遊": 2060, "糾": 2061, "拓": 2062, "初": 2063, "唯": 2064, "迅": 2065, "膝": 2066, "聳": 2067, "か": 2068, "寄": 2069, "果": 2070, "舛": 2071, "摂": 2072, "冠": 2073, "翌": 2074, "素": 2075, "帥": 2076, "倍": 2077, "狼": 2078, "稿": 2079, "柳": 2080, "休": 2081, "補": 2082, "銭": 2083, "歌": 2084, "爪": 2085, "陶": 2086, "凹": 2087, "衰": 2088, "賤": 2089, "袍": 2090, "港": 2091, "移": 2092, "ひ": 2093, "粋": 2094, "防": 2095, "禅": 2096, "レ": 2097, 
"予": 2098, "阿": 2099, "抜": 2100, "寿": 2101, "罪": 2102, "捻": 2103, "撚": 2104, "垢": 2105, "坊": 2106, "磨": 2107, "卜": 2108, "頬": 2109, "塀": 2110, "繊": 2111, "珀": 2112, "見": 2113, "迫": 2114, "乳": 2115, "択": 2116, "争": 2117, "渉": 2118, "哉": 2119, "撒": 2120, "毛": 2121, "坑": 2122, "器": 2123, "茎": 2124, "使": 2125, "揃": 2126, "混": 2127, "憩": 2128, "焦": 2129, "影": 2130, "社": 2131, "虎": 2132, "徐": 2133, "駐": 2134, "沈": 2135, "い": 2136, "豪": 2137, "鉢": 2138, "銃": 2139, "隷": 2140, "範": 2142, "賽": 2143, "連": 2144, "灼": 2145, "軌": 2146, "崎": 2147, "幹": 2148, "儀": 2149, "蝉": 2150, "朱": 2151, "次": 2152, "託": 2153, "ガ": 2154, "露": 2155, "第": 2156, "欺": 2157, "綱": 2158, "降": 2159, "瞞": 2160, "央": 2161, "竈": 2162, "所": 2163, "科": 2164, "秩": 2165, "妬": 2166, "遍": 2167, "辣": 2168, "娼": 2169, "験": 2170, "響": 2171, "攫": 2172, "頤": 2173, "繕": 2174, "育": 2175, "籠": 2176, "疲": 2177, "頚": 2178, "貢": 2179, "僧": 2180, "贈": 2181, "楽": 2182, "殴": 2183, "写": 2184, "空": 2185, "嘆": 2186, "錐": 2187, "娯": 2188, "抑": 2189, "若": 2190, "例": 2191, "款": 2192, "規": 2193, "蔵": 2194, "季": 2195, "局": 2196, "敵": 2197, "丞": 2198, "面": 2199, "美": 2200, "迷": 2201, "居": 2202, "展": 2203, "揺": 2204, "帳": 2205, "癌": 2206, "鉦": 2207, "君": 2208, "姓": 2209, "答": 2210, "錘": 2211, "完": 2212, "窒": 2213, "慌": 2214, "珠": 2215, "逸": 2216, "批": 2217, "膜": 2218, "江": 2219, "提": 2220, "眠": 2221, "鏡": 2222, "教": 2223, "簡": 2224, "単": 2225, "憂": 2226, "即": 2227, "駒": 2228, "屁": 2229, "鈍": 2230, "ぎ": 2231, "画": 2232, "枝": 2233, "獅": 2234, "弐": 2235, "望": 2236, "搾": 2237, "損": 2238, "木": 2239, "沼": 2240, "粧": 2241, "酔": 2242, "挑": 2243, "卵": 2244, "懺": 2245, "審": 2246, "詳": 2247, "判": 2248, "滑": 2249, "蛍": 2250, "丁": 2251, "友": 2252, "町": 2253, "刀": 2254, "歯": 2255, "餃": 2256, "鯉": 2257, "復": 2258, "以": 2259, "散": 2260, "撃": 2261, "縁": 2262, "誰": 2263, "マ": 2264, "号": 2265, "灘": 2266, "個": 2267, "饉": 2268, "殆": 2269, "土": 2270, "賃": 2271, "禍": 2272, "偶": 2273, "扶": 2274, "窮": 2275, "抽": 2276, "孝": 2277, "花": 2278, "島": 2279, "跡": 
2280, "祀": 2281, "肌": 2282, "賊": 2283, "丹": 2284, "式": 2285, "捧": 2286, "逝": 2287, "克": 2288, "采": 2289, "訪": 2290, "餅": 2291, "既": 2292, "服": 2293, "罷": 2294, "価": 2295, "ギ": 2296, "時": 2297, "ヴ": 2298, "淑": 2299, "開": 2300, "ッ": 2301, "寥": 2302, "柿": 2303, "漬": 2304, "弦": 2305, "図": 2306, "毀": 2307, "悠": 2308, "敬": 2309, "紛": 2310, "豊": 2311, "ば": 2312, "修": 2313, "伴": 2314, "磯": 2315, "定": 2316, "続": 2317, "凛": 2318, "隙": 2319, "鹿": 2320, "杳": 2321, "嚇": 2322, "声": 2323, "諭": 2324, "頸": 2325, "氣": 2326, "柵": 2327, "厚": 2328, "魔": 2329, "幼": 2330, "琉": 2331, "践": 2332, "煎": 2333, "肘": 2334, "確": 2335, "処": 2336, "穫": 2337, "剪": 2338, "囲": 2339, "骨": 2340, "柱": 2341, "走": 2342, "冬": 2343, "侍": 2344, "粒": 2345, "減": 2346, "錦": 2347, "股": 2348, "モ": 2349, "秀": 2350, "鞍": 2351, "却": 2352, "郷": 2353, "椒": 2354, "弾": 2355, "阜": 2356, "廂": 2357, "剰": 2358, "退": 2359, "革": 2360, "圧": 2361, "憺": 2362, "膨": 2363, "遥": 2364, "し": 2365, "換": 2366, "臆": 2367, "恒": 2368, "富": 2369, "亀": 2370, "捉": 2371, "惧": 2372, "反": 2373, "栽": 2374, "ミ": 2375, "煉": 2376, "噂": 2377, "漱": 2378, "貌": 2379, "慄": 2380, "竹": 2381, "滝": 2382, "溢": 2383, "恢": 2384, "廻": 2385, "忙": 2386, "雛": 2387, "菓": 2388, "璧": 2389, "茨": 2390, "抗": 2391, "執": 2392, "蝕": 2393, "題": 2394, "札": 2395, "川": 2396, "的": 2397, "訳": 2398, "付": 2399, "稲": 2400, "脇": 2401, "埃": 2402, "笛": 2403, "挿": 2404, "擦": 2405, "衣": 2406, "デ": 2407, "盲": 2408, "妖": 2409, "網": 2410, "幻": 2411, "塞": 2412, "楼": 2413, "綾": 2414, "娘": 2415, "踊": 2416, "殊": 2417, "怖": 2418, "煽": 2419, "弔": 2420, "削": 2421, "秒": 2422, "耗": 2423, "雀": 2424, "際": 2425, "弟": 2426, "蘇": 2427, "石": 2428, "甘": 2429, "今": 2430, "辞": 2431, "掟": 2432, "舶": 2433, "篩": 2434, "冊": 2435, "釣": 2436, "柄": 2437, "嚥": 2438, "非": 2439, "妃": 2440, "傑": 2441, "須": 2442, "韮": 2443, "得": 2444, "宵": 2445, "能": 2446, "西": 2447, "��": 2448, "余": 2449, "恋": 2450, "豹": 2451, "斥": 2452, "揚": 2453, "菜": 2454, "胞": 2455, "毯": 2456, "遭": 2457, "晰": 2458, "扮": 2459, "病": 2460, "寧": 2461, 
"諾": 2462, "泳": 2463, "自": 2464, "誉": 2465, "洒": 2466, "錯": 2467, "歪": 2468, "迦": 2469, "賄": 2470, "嚢": 2471, "裁": 2472, "羹": 2473, "昏": 2474, "后": 2475, "字": 2476, "向": 2477, "亜": 2478, "灌": 2479, "ニ": 2480, "種": 2481, "々": 2482, "団": 2483, "低": 2484, "杉": 2485, "湿": 2486, "脚": 2487, "弥": 2488, "宗": 2489, "托": 2490, "吸": 2491, "根": 2492, "屈": 2493, "升": 2494, "軽": 2495, "獣": 2496, "彰": 2497, "危": 2498, "悼": 2499, "励": 2500, "檎": 2501, "劉": 2502, "込": 2503, "訴": 2504, "箔": 2505, "脅": 2506, "蒸": 2507, "嵌": 2508, "ふ": 2509, "芋": 2510, "貧": 2511, "溺": 2512, "脆": 2513, "奔": 2514, "倫": 2515, "纏": 2516, "田": 2517, "之": 2518, "炎": 2519, "五": 2520, "鵜": 2521, "髣": 2522, "曹": 2523, "突": 2524, "賜": 2525, "姦": 2526, "委": 2527, "常": 2528, "眼": 2529, "末": 2530, "継": 2531, "過": 2532, "鉾": 2533, "戒": 2534, "嫡": 2535, "場": 2536, "姫": 2537, "鮮": 2538, "整": 2539, "耳": 2540, "王": 2541, "潤": 2542, "胸": 2543, "喋": 2544, "蚊": 2545, "簀": 2546, "飼": 2547, "憧": 2548, "料": 2549, "尚": 2550, "肉": 2551, "易": 2552, "ぜ": 2553, "標": 2554, "隻": 2555, "肯": 2556, "蜘": 2557, "ゴ": 2558, "む": 2559, "勃": 2560, "洪": 2561, "宋": 2562, "め": 2563, "塊": 2564, "匹": 2565, "後": 2566, "貪": 2567, "隊": 2568, "咲": 2569, "池": 2570, "府": 2571, "チ": 2572, "招": 2573, "麓": 2574, "許": 2575, "渥": 2576, "嬉": 2577, "閃": 2578, "辺": 2579, "添": 2580, "症": 2581, "壁": 2582, "幕": 2583, "偏": 2584, "魂": 2585, "憶": 2586, "ヶ": 2587, "塹": 2588, "吉": 2589, "鍬": 2590, "飴": 2591, "士": 2592, "狐": 2593, "蹴": 2594, "券": 2595, "養": 2596, "惰": 2597, "ル": 2598, "好": 2599, "碁": 2600, "食": 2601, "縛": 2602, "集": 2603, "七": 2604, "運": 2605, "黒": 2606, "据": 2607, "舷": 2608, "寵": 2609, "卿": 2610, "ベ": 2611, "吹": 2612, "浮": 2613, "功": 2614, "鍋": 2615, "嫉": 2616, "坐": 2617, "青": 2618, "財": 2619, "馬": 2620, "条": 2621, "管": 2622, "仏": 2623, "塗": 2624, "都": 2625, "八": 2626, "没": 2627, "氾": 2628, "萎": 2629, "泉": 2630, "靴": 2631, "熔": 2632, "櫓": 2633, "松": 2634, "尿": 2635, "況": 2636, "敷": 2637, "泥": 2638, "盤": 2639, "玉": 2640, "梨": 2641, "剛": 2642, "麻": 
2643, "畿": 2644, "骸": 2645, "落": 2646, "周": 2647, "桑": 2648, "謄": 2649, "揮": 2650, "旦": 2651, "勧": 2652, "逐": 2653, "優": 2654, "粘": 2655, "度": 2656, "バ": 2657, "頂": 2658, "蟲": 2659, "議": 2660, "ぞ": 2661, "始": 2662, "触": 2663, "聴": 2664, "詫": 2665, "年": 2666, "柑": 2667, "憾": 2668, "祟": 2669, "輸": 2670, "陣": 2671, "児": 2672, "接": 2673, "畑": 2674, "属": 2675, "記": 2676, "隔": 2677, "伸": 2678, "剤": 2679, "産": 2680, "印": 2681, "テ": 2682, "昼": 2683, "烈": 2684, "套": 2685, "井": 2686, "肢": 2687, "筋": 2688, "酷": 2689, "遡": 2690, "覆": 2691, "白": 2692, "祖": 2693, "幣": 2694, "箱": 2695, "激": 2696, "|": 2141, "[UNK]": 2697, "[PAD]": 2698}
|