mizoru committed on
Commit
1732531
·
1 Parent(s): dbc58cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -41
app.py CHANGED
@@ -5,59 +5,89 @@ MODELS = {
5
  "Tatar": {"model_id": "sammy786/wav2vec2-xlsr-tatar", "has_lm": False},
6
  "Chuvash": {"model_id": "sammy786/wav2vec2-xlsr-chuvash", "has_lm": False},
7
  "Bashkir": {"model_id": "AigizK/wav2vec2-large-xls-r-300m-bashkir-cv7_opt", "has_lm": True},
8
- "Erzya": {"model_id": "DrishtiSharma/wav2vec2-large-xls-r-300m-myv-v1", "has_lm": True}
9
  }
10
 
11
  CACHED_MODELS_BY_ID = {}
12
 
13
- LANGUAGES = list(MODELS.keys())
 
 
 
 
 
 
 
 
 
14
 
15
- def run(input_file, language, decoding_type):
16
-
17
- #logger.info(f"Running ASR {language}-{model_size}-{decoding_type} for {input_file}")
18
 
 
 
19
  model = MODELS.get(language, None)
20
-
21
 
22
- if decoding_type == "LM" and not model["has_lm"]:
23
- history.append({
24
- "error_message": f"LM not available for {language} language :("
25
- })
26
- else:
27
 
28
- # model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
29
- model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
30
- if model_instance is None:
31
- model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
32
- CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
 
 
 
33
 
34
- if decoding_type == "LM":
35
- processor = Wav2Vec2ProcessorWithLM.from_pretrained(model["model_id"])
36
- asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
37
- feature_extractor=processor.feature_extractor, decoder=processor.decoder)
38
- else:
39
- processor = Wav2Vec2Processor.from_pretrained(model["model_id"])
40
- asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
41
- feature_extractor=processor.feature_extractor, decoder=None)
42
 
43
- transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]
 
 
 
 
 
 
44
 
 
45
 
 
 
 
 
 
 
 
46
 
47
- return transcription
 
 
 
 
 
48
 
49
- gr.Interface(
50
- run,
51
- inputs=[
52
- gr.Audio(source="microphone", type="filepath", label="Record something..."),
53
- gr.Radio(label="Language", choices=LANGUAGES),
54
- gr.Radio(label="Decoding type", choices=["greedy", "LM"])
55
- # gr.inputs.Radio(label="Model size", choices=["300M", "1B"]),
56
- ],
57
- outputs=[
58
- gr.Textbox()
59
- ],
60
- allow_screenshot=False,
61
- allow_flagging="never",
62
- theme="grass"
63
- ).launch(enable_queue=True)
 
 
 
 
 
 
 
 
 
 
5
  "Tatar": {"model_id": "sammy786/wav2vec2-xlsr-tatar", "has_lm": False},
6
  "Chuvash": {"model_id": "sammy786/wav2vec2-xlsr-chuvash", "has_lm": False},
7
  "Bashkir": {"model_id": "AigizK/wav2vec2-large-xls-r-300m-bashkir-cv7_opt", "has_lm": True},
8
+ "Erzya": {"model_id": "DrishtiSharma/wav2vec2-large-xls-r-300m-myv-v1", "has_lm": False}
9
  }
10
 
11
  CACHED_MODELS_BY_ID = {}
12
 
13
# English language names double as the keys of MODELS.
LANGUAGES_ENG = list(MODELS.keys())
# Russian display names, listed in the same order as LANGUAGES_ENG so the
# two lists can be zipped together.
LANGUAGES_RUS = ["Татарский", "Чувашский", "Башкирский", "Эрзянский"]
# Russian display name -> English key used by MODELS and LANG2YDX.
RUS2ENG = dict(zip(LANGUAGES_RUS, LANGUAGES_ENG))
# Language codes used in the Yandex Translate "lang=<source>-<target>" URL
# parameter. None marks a language for which no translation link is built.
LANG2YDX = {
    "Tatar": "tt",
    "Chuvash": "cv",  # ISO 639-1: cv = Chuvash (was swapped with Bashkir)
    "Bashkir": "ba",  # ISO 639-1: ba = Bashkir (was swapped with Chuvash)
    "Erzya": None,
    # Interface languages, used as translation targets.
    "English": "en",
    "Русский": "ru",
}
23
 
 
 
 
24
 
25
def run(input_file, language, decoding_type, lang):
    """Transcribe a recording and build a link to its Yandex translation.

    Args:
        input_file: Path of the recorded audio file.
        language: Language of the recording, given either as an English key
            of ``MODELS`` or as its Russian display name (mapped through
            ``RUS2ENG``).
        decoding_type: ``"LM"`` or ``"Greedy"``. The index of the choice
            (``1`` for LM) is accepted as well, because the Radio component
            feeding this handler may report indices instead of values.
        lang: Interface language (``"Русский"`` or ``"English"``). It selects
            the translation target and the language of the link label.

    Returns:
        A ``(transcription, html)`` tuple. ``html`` is an ``<a>`` element
        pointing at Yandex Translate, or ``None`` when ``LANG2YDX`` has no
        code for the recording language.

    Raises:
        ValueError: If no supported recording language was selected or no
            audio was provided.
    """
    # Function-scope imports: the file's import block is not touched.
    from html import escape
    from urllib.parse import quote

    language = RUS2ENG.get(language, language)
    model = MODELS.get(language)
    if model is None:
        raise ValueError(f"Unsupported or missing recording language: {language!r}")
    if not input_file:
        raise ValueError("No audio was provided")

    model_id = model["model_id"]

    # Reuse the already-loaded acoustic model when there is one.
    model_instance = CACHED_MODELS_BY_ID.get(model_id)
    if model_instance is None:
        model_instance = AutoModelForCTC.from_pretrained(model_id)
        CACHED_MODELS_BY_ID[model_id] = model_instance

    # LM decoding is only attempted for models flagged with has_lm; anything
    # else falls back to greedy decoding.
    use_lm = decoding_type in ("LM", 1) and model["has_lm"]
    if use_lm:
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
        decoder = processor.decoder
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_id)
        decoder = None

    asr = pipeline(
        "automatic-speech-recognition",
        model=model_instance,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        decoder=decoder,
    )
    transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]

    source_code = LANG2YDX.get(language)
    if not source_code:
        return transcription, None

    # Without a known interface language, translate into English, which
    # matches the English label chosen below.
    target_code = LANG2YDX.get(lang) or "en"
    # Percent-encode the text so spaces, '&', '#', etc. cannot break the URL.
    url = (
        "https://translate.yandex.ru/?lang="
        f"{source_code}-{target_code}&text={quote(transcription, safe='')}"
    )
    label = "Посмотреть перевод" if lang == "Русский" else "Check the translation"
    # Escape the URL for use inside an HTML attribute ('&' -> '&amp;').
    html = f'<a href="{escape(url)}" target="_blank">{label}</a>'

    return transcription, html
54
 
55
+
56
def update_decoding(language):
    """Show the decoding-type selector only for languages that have an LM.

    Args:
        language: Selected recording language, as an English key of
            ``MODELS`` or its Russian display name. May be ``None`` or an
            unknown value when nothing is selected.

    Returns:
        A ``gr.Radio.update`` payload: visible when the language's model has
        ``has_lm`` set; otherwise hidden and reset to ``'Greedy'``.
    """
    language = RUS2ENG.get(language, language)
    # An empty or unknown selection is treated as "no LM" instead of raising
    # KeyError.
    model = MODELS.get(language) or {}
    if model.get('has_lm', False):
        return gr.Radio.update(visible=True)
    return gr.Radio.update(visible=False, value='Greedy')
61
+
62
 
63
def update_interface(lang):
    """Relabel the input widgets for the chosen interface language.

    Args:
        lang: Interface language. ``'Русский'`` selects the Russian labels
            and language names; any other value (including ``None``) selects
            the English ones, so the function always returns a result.

    Returns:
        A tuple of update payloads for the recording-language radio, the
        audio input, and the decoding-type radio, in that order.
    """
    if lang == 'Русский':
        language_label, choices = 'Язык записи', LANGUAGES_RUS
        audio_label = 'Скажите что-нибудь...'
        decoding_label = 'Тип декодирования'
    else:
        language_label, choices = 'Language', LANGUAGES_ENG
        audio_label = 'Say something...'
        decoding_label = 'Decoding type'

    languages = gr.Radio.update(label=language_label, choices=choices)
    audio = gr.Audio.update(label=audio_label)
    decoding = gr.Radio.update(label=decoding_label)
    return languages, audio, decoding
77
+
78
# Assemble the demo UI. It starts in Russian, so the interface-language radio
# is preselected accordingly and every initial label is Russian or bilingual.
with gr.Blocks() as blocks:
    lang = gr.Radio(
        label="Выберите язык интерфейса / Interface language",
        choices=['Русский', 'English'],
        value='Русский',  # keeps run() from receiving lang=None
    )
    languages = gr.Radio(label="Язык записи", choices=LANGUAGES_RUS)
    audio = gr.Audio(source="microphone", type="filepath", label="Скажите что-нибудь...")
    # The radio passes its string value ("Greedy" / "LM"): run() compares it
    # with "LM" and update_decoding() resets it to 'Greedy'. It is hidden
    # until update_decoding() makes it visible.
    decoding = gr.Radio(
        label="Тип декодирования",
        choices=["Greedy", "LM"],
        value="Greedy",
        visible=False,
    )
    btn = gr.Button('Расшифровать / Transcribe')
    output = gr.Textbox(show_label=False)

    # Receives the translation link returned by run().
    translation = gr.HTML()

    languages.change(fn=update_decoding, inputs=[languages], outputs=[decoding])
    lang.change(fn=update_interface, inputs=[lang], outputs=[languages, audio, decoding])
    btn.click(fn=run, inputs=[audio, languages, decoding, lang], outputs=[output, translation])


blocks.launch(enable_queue=True, debug=True)