Pengwei Li
committed on
Commit
•
7a51771
1
Parent(s):
24d73d1
Update README.md
Browse files
README.md
CHANGED
@@ -16,13 +16,12 @@ widget:
|
|
16 |
- example_title: Common Voice sample 1
|
17 |
src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
|
18 |
---
|
19 |
-
|
20 |
## xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022
|
21 |
|
22 |
-
|
23 |
- Spanish-English
|
24 |
- Trained on
|
25 |
-
- Speech synthesis with facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur
|
26 |
|
27 |
## Usage
|
28 |
```python
|
@@ -35,20 +34,21 @@ from fairseq import hub_utils
|
|
35 |
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
|
36 |
from fairseq.models.speech_to_text.hub_interface import S2THubInterface
|
37 |
from fairseq.models.text_to_speech import CodeHiFiGANVocoder
|
38 |
-
from fairseq.models.text_to_speech.hub_interface import (
|
39 |
-
|
40 |
-
VocoderHubInterface,
|
41 |
-
)
|
42 |
from huggingface_hub import snapshot_download
|
43 |
import torchaudio
|
44 |
|
|
|
45 |
|
46 |
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
|
47 |
"facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
|
48 |
-
arg_overrides={"config_yaml": "config.yaml"},
|
|
|
49 |
)
|
50 |
-
model = models[0]
|
51 |
-
|
|
|
52 |
|
53 |
|
54 |
# requires 16000Hz mono channel audio
|
@@ -57,9 +57,7 @@ audio, _ = torchaudio.load("/path/to/an/audio/file")
|
|
57 |
sample = S2THubInterface.get_model_input(task, audio)
|
58 |
unit = S2THubInterface.get_prediction(task, model, generator, sample)
|
59 |
|
60 |
-
# speech synthesis
|
61 |
-
cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
|
62 |
-
|
63 |
library_name = "fairseq"
|
64 |
cache_dir = (
|
65 |
cache_dir or (Path.home() / ".cache" / library_name).as_posix()
|
|
|
16 |
- example_title: Common Voice sample 1
|
17 |
src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
|
18 |
---
|
|
|
19 |
## xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022
|
20 |
|
21 |
+
Speech-to-speech translation model from fairseq S2UT ([paper](https://arxiv.org/abs/2204.02967)/[code](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md)):
|
22 |
- Spanish-English
|
23 |
- Trained on
|
24 |
+
- Speech synthesis with [facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur](https://huggingface.co/facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur)
|
25 |
|
26 |
## Usage
|
27 |
```python
|
|
|
34 |
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
|
35 |
from fairseq.models.speech_to_text.hub_interface import S2THubInterface
|
36 |
from fairseq.models.text_to_speech import CodeHiFiGANVocoder
|
37 |
+
from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
|
38 |
+
|
|
|
|
|
39 |
from huggingface_hub import snapshot_download
|
40 |
import torchaudio
|
41 |
|
42 |
+
cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
|
43 |
|
44 |
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
|
45 |
"facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
|
46 |
+
arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
|
47 |
+
cache_dir=cache_dir,
|
48 |
)
|
49 |
+
model = models[0].cpu()
|
50 |
+
cfg["task"].cpu = True
|
51 |
+
generator = task.build_generator([model], cfg)
|
52 |
|
53 |
|
54 |
# requires 16000Hz mono channel audio
|
|
|
57 |
sample = S2THubInterface.get_model_input(task, audio)
|
58 |
unit = S2THubInterface.get_prediction(task, model, generator, sample)
|
59 |
|
60 |
+
# speech synthesis
|
|
|
|
|
61 |
library_name = "fairseq"
|
62 |
cache_dir = (
|
63 |
cache_dir or (Path.home() / ".cache" / library_name).as_posix()
|