File size: 5,333 Bytes
0a45ee6 2bb294a 0a45ee6 0700f4f fe26a2e f73d55f 0a45ee6 fe26a2e c733de8 fe26a2e 0700f4f fe26a2e 0700f4f fe26a2e 0700f4f 0a45ee6 e4361b5 c733de8 e4361b5 b5605e6 ecf47c2 c733de8 ecf47c2 e4361b5 c733de8 e4361b5 0a45ee6 e4361b5 0a45ee6 e4361b5 0a45ee6 c733de8 0a45ee6 e4361b5 0a45ee6 e4361b5 97cd4e4 e4361b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
---
language: zh-HK
datasets:
- common_voice
tags:
- audio
- automatic-speech-recognition
- hf-asr-leaderboard
- robust-speech-event
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Cantonese (Hong Kong) by Voidful
results:
- task:
name: Speech Recognition
type: automatic-speech-recognition
dataset:
name: Common Voice zh-HK
type: common_voice
args: zh-HK
metrics:
- name: Test CER
type: cer
value: 16.41
---
# Wav2Vec2-Large-XLSR-53-hk
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Cantonese using the [Common Voice](https://huggingface.co/datasets/common_voice).
When using this model, make sure that your speech input is sampled at 16kHz.
## Usage
[Colab trial](https://colab.research.google.com/drive/1nBRLf4Pwiply_y5rXWoaIB8LxX41tfEI?usp=sharing)
```
import torchaudio
from datasets import load_dataset, load_metric
from transformers import (
Wav2Vec2ForCTC,
Wav2Vec2Processor,
)
import torch
import re
import sys
model_name = "voidful/wav2vec2-large-xlsr-53-hk"
device = "cuda"
processor_name = "voidful/wav2vec2-large-xlsr-53-hk"
chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\\"#$%&()*+,\\-.\\:;<=>?@\\[\\]\\\\\\/^_`{|}~]"
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2Processor.from_pretrained(processor_name)
resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
def load_file_to_data(file):
batch = {}
speech, _ = torchaudio.load(file)
batch["speech"] = resampler.forward(speech.squeeze(0)).numpy()
batch["sampling_rate"] = resampler.new_freq
return batch
def predict(data):
features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
input_values = features.input_values.to(device)
attention_mask = features.attention_mask.to(device)
with torch.no_grad():
logits = model(input_values, attention_mask=attention_mask).logits
pred_ids = torch.argmax(logits, dim=-1)
return processor.batch_decode(pred_ids)
```
Predict
```python
predict(load_file_to_data('voice file path'))
```
## Evaluation
The model can be evaluated as follows on the Cantonese (Hong Kong) test data of Common Voice.
CER calculation refer to https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese
```python
!mkdir cer
!wget -O cer/cer.py https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/raw/main/cer.py
!pip install jiwer
import torchaudio
from datasets import load_dataset, load_metric
from transformers import (
Wav2Vec2ForCTC,
Wav2Vec2Processor,
)
import torch
import re
import sys
cer = load_metric("./cer")
model_name = "voidful/wav2vec2-large-xlsr-53-hk"
device = "cuda"
processor_name = "voidful/wav2vec2-large-xlsr-53-hk"
chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\\"#$%&()*+,\\-.\\:;<=>?@\\[\\]\\\\\\/^_`{|}~]"
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2Processor.from_pretrained(processor_name)
ds = load_dataset("common_voice", 'zh-HK', data_dir="./cv-corpus-6.1-2020-12-11", split="test")
resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
def map_to_array(batch):
speech, _ = torchaudio.load(batch["path"])
batch["speech"] = resampler.forward(speech.squeeze(0)).numpy()
batch["sampling_rate"] = resampler.new_freq
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'")
return batch
ds = ds.map(map_to_array)
def map_to_pred(batch):
features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0], padding=True, return_tensors="pt")
input_values = features.input_values.to(device)
attention_mask = features.attention_mask.to(device)
with torch.no_grad():
logits = model(input_values, attention_mask=attention_mask).logits
pred_ids = torch.argmax(logits, dim=-1)
batch["predicted"] = processor.batch_decode(pred_ids)
batch["target"] = batch["sentence"]
return batch
result = ds.map(map_to_pred, batched=True, batch_size=16, remove_columns=list(ds.features.keys()))
print("CER: {:2f}".format(100 * cer.compute(predictions=result["predicted"], references=result["target"])))
```
`CER 16.41`
|