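"""Language Identification from Audio.

Transcribes speech with OpenAI's Whisper, then classifies the language of the
transcription with an XLM-RoBERTa model fine-tuned for language detection.
"""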
import torch
import gradio as gr
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)
from transformers.pipelines.audio_utils import ffmpeg_read
model_id = "openai/whisper-large-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
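# Class index -> language name for the fine-tuned XLM-RoBERTa classifier below.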
LANGUAGE_MAP = {
    0: 'Arabic',
    1: 'Basque',
    2: 'Breton',
    3: 'Catalan',
    4: 'Chinese_China',
    5: 'Chinese_Hongkong',
    6: 'Chinese_Taiwan',
    7: 'Chuvash',
    8: 'Czech',
    9: 'Dhivehi',
    10: 'Dutch',
    11: 'English',
    12: 'Esperanto',
    13: 'Estonian',
    14: 'French',
    15: 'Frisian',
    16: 'Georgian',
    17: 'German',
    18: 'Greek',
    19: 'Hakha_Chin',
    20: 'Indonesian',
    21: 'Interlingua',
    22: 'Italian',
    23: 'Japanese',
    24: 'Kabyle',
    25: 'Kinyarwanda',
    26: 'Kyrgyz',
    27: 'Latvian',
    28: 'Maltese',
    29: 'Mongolian',
    30: 'Persian',
    31: 'Polish',
    32: 'Portuguese',
    33: 'Romanian',
    34: 'Romansh_Sursilvan',
    35: 'Russian',
    36: 'Sakha',
    37: 'Slovenian',
    38: 'Spanish',
    39: 'Swedish',
    40: 'Tamil',
    41: 'Tatar',
    42: 'Turkish',
    43: 'Ukrainian',
    44: 'Welsh',
}
# Whisper model and processor for speech-to-text transcription. The decoder
# prompt is handled by model.generate() below, so no manual BOS token is needed.
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)
model.eval()
model.to(device)
# Text-based language classifier, loaded once at startup instead of on every call.
lid_ckpt = "barto17/language-detection-fine-tuned-on-xlm-roberta-base"
lid_model = AutoModelForSequenceClassification.from_pretrained(lid_ckpt)
lid_tokenizer = AutoTokenizer.from_pretrained(lid_ckpt)
lid_model.eval()


def detect_language(sentence):
    """Classify the language of a sentence; return (language, probability)."""
    tokenized_sentence = lid_tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        output = lid_model(**tokenized_sentence)
    predictions = torch.nn.functional.softmax(output.logits, dim=-1)
    probability, pred_idx = torch.max(predictions, dim=-1)
    language = LANGUAGE_MAP[pred_idx.item()]
    return language, probability.item()
def process_audio_file(file, sampling_rate):
    """Read an audio file from disk and decode it to a float array via ffmpeg."""
    with open(file, "rb") as f:
        inputs = f.read()
    audio = ffmpeg_read(inputs, sampling_rate)
    return audio
def transcribe(microphone, file_upload):
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        file = microphone
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file", "", 0.0
    elif microphone is not None:
        file = microphone
    else:
        file = file_upload

    sampling_rate = processor.feature_extractor.sampling_rate
    audio_data = process_audio_file(file, sampling_rate)
    input_features = processor(
        audio_data, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Transcribe with Whisper, then classify the language of the transcription.
    with torch.no_grad():
        pred_ids = model.generate(input_features.to(device))
    transcription = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]

    language, probability = detect_language(transcription)
    return warn_output + transcription.capitalize(), language, probability
# Each example row must supply a value for every input; the upload slot is left empty.
examples = [[f"./{f}", None] for f in ['sample1.mp3', 'sample2.mp3', 'sample3.mp3']]
article = """
Fine-tuned on the xlm-roberta-base model.\n
Supported languages:\n
'Arabic', 'Basque', 'Breton', 'Catalan', 'Chinese_China', 'Chinese_Hongkong', 'Chinese_Taiwan', 'Chuvash', 'Czech',
'Dhivehi', 'Dutch', 'English', 'Esperanto', 'Estonian', 'French', 'Frisian', 'Georgian', 'German', 'Greek', 'Hakha_Chin',
'Indonesian', 'Interlingua', 'Italian', 'Japanese', 'Kabyle', 'Kinyarwanda', 'Kyrgyz', 'Latvian', 'Maltese',
'Mongolian', 'Persian', 'Polish', 'Portuguese', 'Romanian', 'Romansh_Sursilvan', 'Russian', 'Sakha', 'Slovenian',
'Spanish', 'Swedish', 'Tamil', 'Tatar', 'Turkish', 'Ukrainian', 'Welsh'
"""
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath", label="Microphone"),
        gr.Audio(sources=["upload"], type="filepath", label="File Upload"),
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Language"),
        gr.Number(label="Probability"),
    ],
    examples=examples,
    title="Language Identification from Audio",
    description="Detect the language of spoken audio: transcribe it with Whisper, then classify the transcription.",
    article=article,
).launch()