update model

Browse files

Files changed (3) hide show

README.md +32 -12
pytorch_model.bin +2 -2
vocab.json +1 -1

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ datasets:
 - common_voice
 metrics:
 - wer
 tags:
 - audio
 - automatic-speech-recognition
@@ -23,12 +24,15 @@ model-index:
     metrics:
        - name: Test WER
          type: wer
-         value: 34.49
 ---
 # Wav2Vec2-Large-XLSR-53-Hungarian
-Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Hungarian using the [Common Voice](https://huggingface.co/datasets/common_voice).
 When using this model, make sure that your speech input is sampled at 16kHz.
 The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint
@@ -45,8 +49,9 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 LANG_ID = "hu"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
-test_dataset = load_dataset("common_voice", LANG_ID, split="test[:2%]")
 processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
@@ -60,21 +65,31 @@ def speech_file_to_array_fn(batch):
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
-inputs = processor(test_dataset[:2]["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
-print("Prediction:", processor.batch_decode(predicted_ids))
-print("Reference:", test_dataset[:2]["sentence"])
 ```
 ## Evaluation
-The model can be evaluated as follows on the hungarian test data of Common Voice.
 ```python
 import torch
@@ -87,12 +102,13 @@ LANG_ID = "hu"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
 DEVICE = "cuda"
-CHARS_TO_IGNORE = [",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�", "ʿ", "·", "჻", "¿", "¡", "~", "՞",
                    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
-                   "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
-wer = load_metric("wer")
 chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"
@@ -124,7 +140,11 @@ def evaluate(batch):
 result = test_dataset.map(evaluate, batched=True, batch_size=32)
-print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
-**Test Result**: 34.49%

 - common_voice
 metrics:
 - wer
+- cer
 tags:
 - audio
 - automatic-speech-recognition
     metrics:
        - name: Test WER
          type: wer
+         value: 31.40
+       - name: Test CER
+         type: cer
+         value: 10.49
 ---
 # Wav2Vec2-Large-XLSR-53-Hungarian
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Hungarian using the [Common Voice](https://huggingface.co/datasets/common_voice) and [CSS10](https://github.com/Kyubyong/css10) datasets.
 When using this model, make sure that your speech input is sampled at 16kHz.
 The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint
 LANG_ID = "hu"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
+SAMPLES = 5
+test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
 processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
+inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
+predicted_sentences = processor.batch_decode(predicted_ids)
+for i, predicted_sentence in enumerate(predicted_sentences):
+    print("-" * 100)
+    print("Reference:", test_dataset[i]["sentence"])
+    print("Prediction:", predicted_sentence)
 ```
+| Reference  | Prediction |
+| ------------- | ------------- |
+| BÜSZKÉK VAGYUNK A MAGYAR EMBEREK NAGYSZERŰ SZELLEMI ALKOTÁSAIRA. | BÜSZKÉK VAGYUNK A MAGYAR EMBEREK NAGYSZERŰ SZELLEMI ALKOTÁSAIRE |
+| A NEMZETSÉG TAGJAI KÖZÜL EZT TERMESZTIK A LEGSZÉLESEBB KÖRBEN ÍZLETES TERMÉSÉÉRT. | A NEMZETSÉG TAGJAI KÖZÜL ESZSZERMESZTIK A LEGSZELESEBB KÖRBEN IZLETES TERMÉSSÉÉRT |
+| A VÁROSBA VÁGYÓDOTT A LEGJOBBAN, ÉPPEN MERT ODA NEM JUTHATOTT EL SOHA. | A VÁROSBA VÁGYÓDOTT A LEGJOBBAN ÉPPEN MERT ODA NEM JUTHATOTT EL SOHA |
+| SÍRJA MÁRA MEGSEMMISÜLT. | SIMGI A MANDO MEG SEMMICSEN |
+| MINDEN ZENESZÁMOT DRÁGAKŐNEK NEVEZETT. | MINDEN ZENA SZÁMODRAGAKŐNEK NEVEZETT |
 ## Evaluation
+The model can be evaluated as follows on the Hungarian test data of Common Voice.
 ```python
 import torch
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian"
 DEVICE = "cuda"
+CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
+                   "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
+wer = load_metric("wer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
+cer = load_metric("cer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py
 chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"
 result = test_dataset.map(evaluate, batched=True, batch_size=32)
+print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=8000)))
+print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=8000)))
 ```
+**Test Result**:
+- WER: 31.40%
+- CER: 10.49%

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:387a71ad34db3306482b4a56141da584923a251b72d05f8844c32eca14d3340a
-size 1262097815

 version https://git-lfs.github.com/spec/v1
+oid sha256:9ab6510ff7c1c59ff751c63047eba527956df889284321f87a73cbf322012932
+size 1262101911

vocab.json CHANGED Viewed

@@ -1 +1 @@

- {"Q": 0, "C": 1, "\u00cd": 2, "\u00c9": 3, "\u00da": 4, "X": 5, "P": 6, "S": 7, "M": 8, "G": 9, "\u00dc": 10, "\u00c1": 11, "\u00d6": 12, "Y": 13, "J": 14, "O": 15, "H": 16, "Z": 17, "V": 18, "L": 19, "W": 20, "I": 21, "\u00d3": 22, "E": 23, "K": 24, "B": 25, "F": 26, "A": 27, "N": 28, "\u0170": 29, "R": 30, "D": 31, "\u0150": 32, "U": 34, "T": 35, "|": 33, "<unk>": 36, "<pad>": 37, "<s>": 38, "</s>": 39}


+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "J": 5, "Ű": 6, "G": 7, "Y": 8, "Á": 9, "L": 10, "Ü": 11, "H": 12, "V": 13, "É": 14, "A": 15, "P": 16, "C": 17, "M": 18, "Q": 19, "-": 20, "Ú": 21, "K": 22, "D": 23, "Ő": 24, "Ó": 25, "R": 26, "W": 27, "N": 28, "B": 29, "X": 30, "Í": 31, "S": 32, "O": 33, "F": 34, "T": 35, "Z": 36, "U": 37, "E": 38, "I": 39, "Ö": 40}