Spaces:
Runtime error
Runtime error
File size: 6,670 Bytes
4f3f83c a12d0b2 425758f 4f3f83c 9d00207 88575d6 ec1a913 197d161 ec1a913 4f3f83c 425758f 4f3f83c 0c25646 0f2ccca 0c25646 4442b41 0c25646 0f2ccca 0c25646 4442b41 a12d0b2 9a2a1d6 a12d0b2 0c25646 1e59118 8299dc1 0c25646 4f3f83c 197d161 4f3f83c 0c25646 4f3f83c fe71706 4f3f83c b2dca93 4f3f83c 6769f1b 4f3f83c fe71706 4f3f83c 199ccd6 0c25646 4f3f83c aabfdd7 4f3f83c 36a6f79 4f3f83c 4831744 199ccd6 4f3f83c 199ccd6 4f3f83c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import torch
from pydub import AudioSegment
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr
from pytube import YouTube
#from transformers import WhisperForConditionalGeneration, WhisperProcessor
#from transformers.models.whisper.tokenization_whisper import LANGUAGES
#from transformers.pipelines.audio_utils import ffmpeg_read
model_id = "openai/whisper-large-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
LANGUANGE_MAP = {
0: 'Arabic',
1: 'Basque',
2: 'Breton',
3: 'Catalan',
4: 'Chinese_China',
5: 'Chinese_Hongkong',
6: 'Chinese_Taiwan',
7: 'Chuvash',
8: 'Czech',
9: 'Dhivehi',
10: 'Dutch',
11: 'English',
12: 'Esperanto',
13: 'Estonian',
14: 'French',
15: 'Frisian',
16: 'Georgian',
17: 'German',
18: 'Greek',
19: 'Hakha_Chin',
20: 'Indonesian',
21: 'Interlingua',
22: 'Italian',
23: 'Japanese',
24: 'Kabyle',
25: 'Kinyarwanda',
26: 'Kyrgyz',
27: 'Latvian',
28: 'Maltese',
29: 'Mongolian',
30: 'Persian',
31: 'Polish',
32: 'Portuguese',
33: 'Romanian',
34: 'Romansh_Sursilvan',
35: 'Russian',
36: 'Sakha',
37: 'Slovenian',
38: 'Spanish',
39: 'Swedish',
40: 'Tamil',
41: 'Tatar',
42: 'Turkish',
43: 'Ukranian',
44: 'Welsh'
}
import whisper
# define function for transcription
def transcribe(Microphone, File_Upload, URL):
warn_output = ""
if (Microphone is not None) and (File_Upload is not None):
warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
"The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
file = Microphone
elif Microphone is not None:
file = Microphone
#elif URL:
# link = YouTube(URL)
# file = link.streams.filter(only_audio=True)[0].download(filename="audio.mp3")
elif URL:
link = YouTube(URL)
stream = link.streams.filter(only_audio=True).first()
# Download the audio file with a temporary filename
temp_filename = "temp_audio_file"
stream.download(filename=temp_filename)
# Load the downloaded file with pydub and convert it to mp3
audio = AudioSegment.from_file(temp_filename, format="mp4")
# Truncate it to the first 30 seconds
truncated_audio = audio[:30000] # AudioSegment works in milliseconds
file = "file.mp3"
truncated_audio.export(file, format="mp3")
else:
file = File_Upload
language = None
options = whisper.DecodingOptions(without_timestamps=True)
loaded_model = whisper.load_model("base")
transcript = loaded_model.transcribe(file, language=language)
return detect_language(transcript["text"])
def detect_language(sentence):
model_ckpt = "barto17/language-detection-fine-tuned-on-xlm-roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenized_sentence = tokenizer(sentence, return_tensors='pt')
output = model(**tokenized_sentence)
predictions = torch.nn.functional.softmax(output.logits, dim=-1)
probability, pred_idx = torch.max(predictions, dim=-1)
language = LANGUANGE_MAP[pred_idx.item()]
return sentence, language, probability.item()
"""
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)
model.eval()
model.to(device)
bos_token_id = processor.tokenizer.all_special_ids[-106]
decoder_input_ids = torch.tensor([bos_token_id]).to(device)
def process_audio_file(file, sampling_rate):
with open(file, "rb") as f:
inputs = f.read()
audio = ffmpeg_read(inputs, sampling_rate)
print(audio)
return audio
def transcribe(Microphone, File_Upload):
warn_output = ""
if (Microphone is not None) and (File_Upload is not None):
warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
"The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
file = Microphone
elif (Microphone is None) and (File_Upload is None):
return "ERROR: You have to either use the microphone or upload an audio file"
elif Microphone is not None:
file = Microphone
else:
file = File_Upload
sampling_rate = processor.feature_extractor.sampling_rate
audio_data = process_audio_file(file, sampling_rate)
input_features = processor(audio_data, return_tensors="pt").input_features
with torch.no_grad():
logits = model.forward(input_features.to(device), decoder_input_ids=decoder_input_ids).logits
pred_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(pred_ids[0])
language, probability = detect_language(transcription)
return transcription.capitalize(), language, probability
"""
examples=['sample1.mp3', 'sample2.mp3', 'sample3.mp3']
examples = [[f"./{f}"] for f in examples]
outputs=gr.outputs.Label(label="Language detected:")
article = """
Fine-tuned on xlm-roberta-base model.\n
Supported languages:\n
'Arabic', 'Basque', 'Breton', 'Catalan', 'Chinese_China', 'Chinese_Hongkong', 'Chinese_Taiwan', 'Chuvash', 'Czech',
'Dhivehi', 'Dutch', 'English', 'Esperanto', 'Estonian', 'French', 'Frisian', 'Georgian', 'German', 'Greek', 'Hakha_Chin',
'Indonesian', 'Interlingua', 'Italian', 'Japanese', 'Kabyle', 'Kinyarwanda', 'Kyrgyz', 'Latvian', 'Maltese',
'Mangolian', 'Persian', 'Polish', 'Portuguese', 'Romanian', 'Romansh_Sursilvan', 'Russian', 'Sakha', 'Slovenian',
'Spanish', 'Swedish', 'Tamil', 'Tatar', 'Turkish', 'Ukranian', 'Welsh'
"""
gr.Interface(
fn=transcribe,
inputs=[
gr.inputs.Audio(source="microphone", type='filepath', optional=True),
gr.inputs.Audio(source="upload", type='filepath', optional=True),
gr.Textbox(label="Paste YouTube link here [For computation purposes: only first 30 seconds are evaluated]"),
],
outputs=[
gr.outputs.Textbox(label="Transcription"),
gr.outputs.Textbox(label="Language"),
gr.Number(label="Probability"),
],
verbose=True,
examples = examples,
title="Language Identification from Audio",
description="Detect the Language from Audio.",
article=article,
theme="huggingface"
).launch()
|