Commit
·
d7aa717
1
Parent(s):
6750fe1
Update README.md
Browse files
README.md
CHANGED
@@ -90,6 +90,8 @@ print("Prediction:", processor.batch_decode(predicted_ids))
|
|
90 |
print("Reference:", test_data["text"][:2])
|
91 |
```
|
92 |
|
|
|
|
|
93 |
# Code For Evaluation on OpenSLR (Hindi + Marathi : https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
|
94 |
```python
|
95 |
import torchaudio
|
@@ -101,7 +103,7 @@ import re
|
|
101 |
test = Dataset.from_csv('test.csv')
|
102 |
|
103 |
|
104 |
-
chars_to_ignore_regex = '[
|
105 |
|
106 |
# Preprocessing the datasets.
|
107 |
# We need to read the audio files as arrays
|
@@ -132,6 +134,8 @@ test = test.map(evaluate, batched=True, batch_size=32)
|
|
132 |
print("WER: {:2f}".format(100 * wer.compute(predictions=test["pred_strings"], references=test["sentence"])))
|
133 |
```
|
134 |
|
|
|
|
|
135 |
#### Code for Evaluation on Common Voice Hindi (Common Voice does not have Marathi yet)
|
136 |
```python
|
137 |
import torchaudio
|
@@ -141,7 +145,7 @@ import numpy as np
|
|
141 |
import re
|
142 |
from datasets import load_dataset
|
143 |
|
144 |
-
chars_to_ignore_regex = '[
|
145 |
|
146 |
# Preprocessing the datasets.
|
147 |
# We need to read the audio files as arrays
|
@@ -176,4 +180,6 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=test_data["pred_strings"
|
|
176 |
Link to eval notebook : https://colab.research.google.com/drive/1nZRTgKfxCD9cvy90wikTHkg2il3zgcqW#scrollTo=cXWFbhb0d7DT
|
177 |
|
178 |
WER : 24.944955% (OpenSLR Hindi+Marathi Test set : https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
|
|
|
|
|
179 |
WER: 49.303944% (Common Voice Hindi Test Split)
|
|
|
90 |
print("Reference:", test_data["text"][:2])
|
91 |
```
|
92 |
|
93 |
+
|
94 |
+
|
95 |
# Code For Evaluation on OpenSLR (Hindi + Marathi : https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
|
96 |
```python
|
97 |
import torchaudio
|
|
|
103 |
test = Dataset.from_csv('test.csv')
|
104 |
|
105 |
|
106 |
+
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'
|
107 |
|
108 |
# Preprocessing the datasets.
|
109 |
# We need to read the audio files as arrays
|
|
|
134 |
print("WER: {:2f}".format(100 * wer.compute(predictions=test["pred_strings"], references=test["sentence"])))
|
135 |
```
|
136 |
|
137 |
+
|
138 |
+
|
139 |
#### Code for Evaluation on Common Voice Hindi (Common Voice does not have Marathi yet)
|
140 |
```python
|
141 |
import torchaudio
|
|
|
145 |
import re
|
146 |
from datasets import load_dataset
|
147 |
|
148 |
+
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'
|
149 |
|
150 |
# Preprocessing the datasets.
|
151 |
# We need to read the audio files as arrays
|
|
|
180 |
Link to eval notebook : https://colab.research.google.com/drive/1nZRTgKfxCD9cvy90wikTHkg2il3zgcqW#scrollTo=cXWFbhb0d7DT
|
181 |
|
182 |
WER : 24.944955% (OpenSLR Hindi+Marathi Test set : https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
|
183 |
+
|
184 |
+
|
185 |
WER: 49.303944% (Common Voice Hindi Test Split)
|