import spaces
import gradio as gr
import torch
import torchaudio
from transformers import pipeline
from pytube import YouTube
import re
from scipy.signal import wiener
from io import BytesIO
import soundfile as sf
import noisereduce as nr


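# ASR pipeline backed by a Wav2Vec2-BERT model fine-tuned for Kabardian.
# device=0 places inference on the first GPU (see the @spaces.GPU decorator below).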
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)

# Kabardian digraphs/trigraphs and the single-character symbols the model uses for them
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

# Invert the mapping so the model's output can be converted back to standard orthography.
# Order matters in the regex alternation: 'qҳ' (кхъ) is listed before 'q' (къ) so the longer match wins.
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))

def replace_symbols_back(text):
    return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
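# Example: replace_symbols_back("ɕalэ") returns "щӏалэ" ('ɕ' -> 'щӏ').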

def preprocess_audio(audio_tensor, original_sample_rate, apply_normalization):
    audio_tensor = audio_tensor.to(dtype=torch.float32)
    audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)  # Downmix to mono

    if apply_normalization:
        peak = torch.max(torch.abs(audio_tensor))
        if peak > 0:  # Guard against division by zero on silent audio
            audio_tensor = audio_tensor / peak  # Peak-normalize to [-1, 1]

    # Resample to the 16 kHz the model expects
    audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000)
    return audio_tensor

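# Spectral gating via the noisereduce package: estimates a noise profile from the
# signal itself and attenuates time-frequency bins that fall below the gate.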
def spectral_gating(audio_tensor):
    audio_data = audio_tensor.numpy()
    reduced_noise = nr.reduce_noise(y=audio_data, sr=16_000)
    return torch.tensor(reduced_noise, dtype=audio_tensor.dtype)

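# scipy.signal.wiener applies a simple adaptive Wiener filter that suppresses
# stationary noise using local mean and variance estimates.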
def wiener_filter(audio_tensor):
    audio_data = audio_tensor.numpy()
    filtered_audio = wiener(audio_data)
    return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)

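# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of the call;
# it is a no-op when running elsewhere.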
@spaces.GPU
def transcribe_speech(audio, progress=gr.Progress()):
    if audio is None:
        return "No audio received.", None
    progress(0.5, desc="Transcribing audio...")

    # The microphone/upload tab passes a filepath; the YouTube path passes a tensor.
    if isinstance(audio, str):
        waveform, sample_rate = torchaudio.load(audio)
        audio = preprocess_audio(waveform, sample_rate, apply_normalization=False)

    audio_np = audio.numpy().squeeze()
    transcription = pipe(audio_np, chunk_length_s=10)['text']
    return replace_symbols_back(transcription), (16000, audio_np)

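# Downloads the audio-only stream into memory with pytube, then runs the same
# preprocessing, optional denoising, and transcription path as the microphone tab.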
def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
    try:
        progress(0, "Downloading YouTube audio...")
        yt = YouTube(url)
        stream = yt.streams.filter(only_audio=True).first()
        audio_data = BytesIO()
        stream.stream_to_buffer(audio_data)
        audio_data.seek(0)

        audio, original_sample_rate = torchaudio.load(audio_data)
        audio = preprocess_audio(audio, original_sample_rate, apply_normalization)

        if apply_wiener_filter:
            progress(0.3, "Applying Wiener filter...")
            audio = wiener_filter(audio)

        if apply_spectral_gating:
            progress(0.4, "Applying spectral gating filter...")
            audio = spectral_gating(audio)

        transcription, _ = transcribe_speech(audio)

        # Save the processed audio so the UI can play it back
        audio_np = audio.numpy().squeeze()
        sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')

    except Exception as e:
        return str(e), None

    return transcription, "temp_audio.wav"

def populate_metadata(url):
    # Fires on every change of the URL textbox, so ignore errors from partial URLs
    try:
        yt = YouTube(url)
        return yt.thumbnail_url, yt.title
    except Exception:
        return None, None

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
              <div>
                <h1>Kabardian Speech Transcription</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
Kabardian speech-to-text transcription using a fine-tuned Wav2Vec2-BERT model
              </p>
            </div>
        """
    )

    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources=['microphone', 'upload'], type="filepath", label="Record or upload audio")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")
        audio_output = gr.Audio(label="Processed Audio")

        transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=[transcription_output, audio_output])

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")

        with gr.Accordion("Audio Improvements", open=False):
            apply_normalization = gr.Checkbox(label="Normalize audio volume", value=True)
            apply_spectral_gating = gr.Checkbox(label="Apply Spectral Gating filter", info="Noise reduction", value=True)
            apply_wiener = gr.Checkbox(label="Apply Wiener filter", info="Noise reduction", value=False)

        with gr.Row():
            img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
            title = gr.Label(label="Video Title", scale=2)

        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
        audio_output = gr.Audio(label="Processed Audio")

        transcribe_button.click(fn=transcribe_from_youtube, inputs=[youtube_url, apply_wiener, apply_normalization, apply_spectral_gating], outputs=[transcription_output, audio_output])
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()