Update app.py
app.py CHANGED

@@ -3,18 +3,16 @@ import os
 import gradio as gr
 import torch
 import torchaudio
-from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
-from pytube import YouTube
 from transformers import pipeline
+from pytube import YouTube
 import re
-from pydub import AudioSegment
-from scipy.io import wavfile
-from scipy.signal import wiener
 import numpy as np
+from scipy.signal import wiener
+from io import BytesIO
 
-
-pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0) # new model with a new tokenizer
+pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
 
+# Define the replacements for Kabardian transcription
 replacements = [
     ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
     ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
@@ -22,55 +20,55 @@ replacements = [
     ('щӏ', 'ɕ'), ('я', 'йа')
 ]
 
+# Reverse replacements for transcription
 reverse_replacements = {v: k for k, v in replacements}
 reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))
 
 def replace_symbols_back(text):
     return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
 
-def
-
-
-
+def preprocess_audio(audio_tensor, original_sample_rate):
+    audio_tensor = audio_tensor.to(dtype=torch.float32)
+    audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True) # Convert to mono
+    audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor)) # Normalize
+    audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000) # Resample
+    return audio_tensor
 
-def apply_wiener_filter(
-
+def apply_wiener_filter(audio_tensor):
+    audio_data = audio_tensor.numpy()
     filtered_audio = wiener(audio_data)
-
-
-def resample_audio(audio_path, target_sample_rate=16000):
-    audio, sample_rate = torchaudio.load(audio_path)
-    resampled_audio = torchaudio.transforms.Resample(sample_rate, target_sample_rate)(audio)
-    torchaudio.save(audio_path, resampled_audio, target_sample_rate)
+    return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
 
 @spaces.GPU
 def transcribe_speech(audio, progress=gr.Progress()):
-    if audio is None:
+    if audio is None:
         return "No audio received."
-
     progress(0.5, desc="Transcribing audio...")
-
-
+    audio_np = audio.numpy().squeeze()
+    transcription = pipe(audio_np, chunk_length_s=10)['text']
     return replace_symbols_back(transcription)
 
 def transcribe_from_youtube(url, apply_improvements, progress=gr.Progress()):
     progress(0, "Downloading YouTube audio...")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    yt = YouTube(url)
+    stream = yt.streams.filter(only_audio=True).first()
+    audio_data = BytesIO()
+    stream.stream_to_buffer(audio_data)
+    audio_data.seek(0)
+
+    try:
+        audio, original_sample_rate = torchaudio.load(audio_data)
+        audio = preprocess_audio(audio, original_sample_rate)
+
+        if apply_improvements:
+            progress(0.4, "Applying Wiener filter...")
+            audio = apply_wiener_filter(audio)
+
+        transcription = transcribe_speech(audio)
+
+    except Exception as e:
+        return str(e)
 
     return transcription
 
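The reverse mapping added in this commit turns the model's IPA-style output back into standard Kabardian Cyrillic digraphs. A quick round-trip sketch, assuming the `replacements` list is completed as in app.py (only the pairs visible in this diff are shown; the rest are elided):

import re

replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    # ... remaining pairs as in app.py ...
    ('щӏ', 'ɕ'), ('я', 'йа')
]

reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))

def replace_symbols_back(text):
    return reverse_pattern.sub(lambda m: reverse_replacements[m.group(0)], text)

# 'qҳ' precedes 'q' in the alternation, so the longer key wins where both could match
print(replace_symbols_back('ɬэпq'))  # -> 'лъэпкъ'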
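The same pipeline path can be exercised outside the Gradio Space. A minimal sketch, assuming a local WAV file at sample.wav (hypothetical path) and CPU inference; the preprocessing mirrors preprocess_audio and apply_wiener_filter from this commit:

import torch
import torchaudio
from scipy.signal import wiener
from transformers import pipeline

pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2")  # the Space pins device=0 (GPU)

# Mono, normalized, 16 kHz: the same steps as preprocess_audio
audio, sr = torchaudio.load("sample.wav")  # hypothetical input file
audio = audio.to(dtype=torch.float32)
audio = torch.mean(audio, dim=0, keepdim=True)
audio = audio / torch.max(torch.abs(audio))
audio = torchaudio.functional.resample(audio, orig_freq=sr, new_freq=16000)

# Optional denoising, as in apply_wiener_filter
audio = torch.tensor(wiener(audio.numpy()), dtype=audio.dtype)

# The ASR pipeline accepts a 1-D float array at the model's sampling rate
text = pipe(audio.numpy().squeeze(), chunk_length_s=10)["text"]
print(text)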