import gradio as gr
import time
import torch
import soundfile
import scipy.io.wavfile
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none
from espnet2.bin.asr_inference import Speech2Text

# tagen = 'kan-bayashi/ljspeech_vits' 
# vocoder_tagen = "none" 

# Load the pretrained SLU model (SLURP recipe).
speech2text = Speech2Text.from_pretrained(
    asr_train_config="slurp/config.yaml",
    asr_model_file="slurp/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file.
    nbest=1,
)
# Confirm that the sampling rate matches that of the training corpus.
# If not, resample the audio data before passing it to speech2text.
# speech, rate = soundfile.read("audio--1504190171-headset.flac")
# nbests = speech2text(speech)

# text, *_ = nbests[0]
# print(text)
# exit()
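
# A minimal resampling sketch (an assumption, not part of the original script):
# it assumes the SLURP model expects 16 kHz input and that the optional
# librosa dependency is available. Kept commented out like the example above.
# import librosa
# speech, rate = soundfile.read("audio--1504190171-headset.flac")
# if rate != 16000:
#     speech = librosa.resample(speech, orig_sr=rate, target_sr=16000)
#     rate = 16000
# nbests = speech2text(speech)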

# text2speechen = Text2Speech.from_pretrained(
#     model_tag=str_or_none(tagen),
#     vocoder_tag=str_or_none(vocoder_tagen),
#     device="cpu",
#     # Only for Tacotron 2 & Transformer
#     threshold=0.5,
#     # Only for Tacotron 2
#     minlenratio=0.0,
#     maxlenratio=10.0,
#     use_att_constraint=False,
#     backward_window=1,
#     forward_window=3,
#     # Only for FastSpeech & FastSpeech2 & VITS
#     speed_control_alpha=1.0,
#     # Only for VITS
#     noise_scale=0.333,
#     noise_scale_dur=0.333,
# )


# tagjp = 'kan-bayashi/jsut_full_band_vits_prosody' 
# vocoder_tagjp = 'none'

# text2speechjp = Text2Speech.from_pretrained(
#     model_tag=str_or_none(tagjp),
#     vocoder_tag=str_or_none(vocoder_tagjp),
#     device="cpu",
#     # Only for Tacotron 2 & Transformer
#     threshold=0.5,
#     # Only for Tacotron 2
#     minlenratio=0.0,
#     maxlenratio=10.0,
#     use_att_constraint=False,
#     backward_window=1,
#     forward_window=3,
#     # Only for FastSpeech & FastSpeech2 & VITS
#     speed_control_alpha=1.0,
#     # Only for VITS
#     noise_scale=0.333,
#     noise_scale_dur=0.333,
# )

# tagch = 'kan-bayashi/csmsc_full_band_vits'
# vocoder_tagch = "none" 

# text2speechch = Text2Speech.from_pretrained(
#     model_tag=str_or_none(tagch),
#     vocoder_tag=str_or_none(vocoder_tagch),
#     device="cpu",
#     # Only for Tacotron 2 & Transformer
#     threshold=0.5,
#     # Only for Tacotron 2
#     minlenratio=0.0,
#     maxlenratio=10.0,
#     use_att_constraint=False,
#     backward_window=1,
#     forward_window=3,
#     # Only for FastSpeech & FastSpeech2 & VITS
#     speed_control_alpha=1.0,
#     # Only for VITS
#     noise_scale=0.333,
#     noise_scale_dur=0.333,
# )

def inference(wav, lang):
    """Run SLU decoding on the recorded audio and return the predicted text."""
    text = ""
    with torch.no_grad():
        if lang == "english":
            speech, rate = soundfile.read(wav.name)
            nbests = speech2text(speech)
            text, *_ = nbests[0]
        # if lang == "chinese":
        #     wav = text2speechch(text)["wav"]
        #     scipy.io.wavfile.write("out.wav", text2speechch.fs, wav.view(-1).cpu().numpy())
        # if lang == "japanese":
        #     wav = text2speechjp(text)["wav"]
        #     scipy.io.wavfile.write("out.wav", text2speechjp.fs, wav.view(-1).cpu().numpy())
    return text
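
# Optional local smoke test (hypothetical, not in the original script): it
# assumes the bundled example clip audio_slurp.flac sits next to this file.
# Uncomment to verify decoding once before launching the UI.
# print(inference(open("audio_slurp.flac", "rb"), "english"))
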
title = "ESPnet2-SLU"
description = "Gradio demo for ESPnet2-SLU: Extending the Edge of SLU Research. To use it, simply record your audio. Read more at the link below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

examples = [["audio_slurp.flac", "english"]]

# gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
gr.Interface(
    inference,
    [
        gr.inputs.Audio(label="input audio", type="file"),
        gr.inputs.Radio(choices=["english"], type="value", default="english", label="language"),
    ],
    gr.outputs.Textbox(type="str", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)