import spaces
import os
import gradio as gr
import torch
import torchaudio
from transformers import pipeline
from pytube import YouTube
import re
import numpy as np
from scipy.signal import wiener
from io import BytesIO
import noisereduce as nr
import soundfile as sf


# Load the ASR pipeline with the fine-tuned Kabardian Wav2Vec2-BERT checkpoint
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)

# Single-codepoint stand-ins for Kabardian digraphs/trigraphs, matching the model's output vocabulary
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

# Reverse mapping: restore standard Kabardian orthography in the model's output
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))

def replace_symbols_back(text):
    return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
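
# Informal sanity check of the reverse mapping (illustrative only; the expected
# values are taken directly from the `replacements` table above):
assert replace_symbols_back("ɣ") == "гъ"
assert replace_symbols_back("qҳ") == "кхъ"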

def preprocess_audio(audio_tensor, original_sample_rate, apply_normalization):
    audio_tensor = audio_tensor.to(dtype=torch.float32)
    audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)  # Convert to mono

    if apply_normalization:
        peak = torch.max(torch.abs(audio_tensor))
        if peak > 0:  # Guard against division by zero on silent input
            audio_tensor = audio_tensor / peak  # Peak-normalize to [-1, 1]

    audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000)  # Resample to the model's 16 kHz input rate
    return audio_tensor
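
# Expected contract, sketched with illustrative values: a one-second stereo clip
# at 44.1 kHz comes back as mono 16 kHz, e.g.
#   preprocess_audio(torch.randn(2, 44100), 44100, apply_normalization=True)
# returns a tensor of shape (1, 16000).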

def spectral_gating(audio_tensor):
    audio_data = audio_tensor.numpy()
    reduced_noise = nr.reduce_noise(y=audio_data, sr=16_000)
    return torch.tensor(reduced_noise, dtype=audio_tensor.dtype)
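
# Note: noisereduce estimates a noise profile from the signal itself. For steady
# background noise, nr.reduce_noise(..., stationary=True) is an optional variant
# (an assumption about the installed noisereduce version; not used here).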

def wiener_filter(audio_tensor):
    # Filter the mono signal as 1-D: applying scipy's wiener to the (1, n) array
    # would use a 2-D window and zero-pad across the channel axis, attenuating the signal.
    audio_data = audio_tensor.numpy().squeeze()
    filtered_audio = wiener(audio_data)
    return torch.tensor(filtered_audio, dtype=audio_tensor.dtype).unsqueeze(0)

@spaces.GPU
def transcribe_speech(audio_path, progress=gr.Progress()):
    if audio_path is None:
        return "No audio received."

    progress(0.1, desc="Preprocessing audio...")
    audio_tensor, original_sample_rate = torchaudio.load(audio_path)
    # The microphone tab exposes no normalization option, so skip it here; the
    # module-level `apply_normalization` name is a Gradio checkbox component,
    # not a boolean, and must not be referenced directly.
    audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization=False)

    progress(0.7, desc="Transcribing audio...")
    audio_np = audio_tensor.numpy().squeeze()
    transcription = pipe(audio_np, chunk_length_s=10)['text']
    transcription = replace_symbols_back(transcription)

    return transcription
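
# Illustrative call with a hypothetical local file:
#   text = transcribe_speech("sample.wav")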

@spaces.GPU
def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
    progress(0, "Downloading YouTube audio...")
    
    try:
        yt = YouTube(url)
        stream = yt.streams.filter(only_audio=True).first()
        audio_data = BytesIO()
        stream.stream_to_buffer(audio_data)
        audio_data.seek(0)
    
        audio_tensor, original_sample_rate = torchaudio.load(audio_data)
        audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)

        if apply_wiener_filter:
            progress(0.4, "Applying Wiener filter...")
            audio_tensor = wiener_filter(audio_tensor)

        if apply_spectral_gating:
            progress(0.6, "Applying Spectral Gating filter...")
            audio_tensor = spectral_gating(audio_tensor)

        progress(0.8, "Transcribing audio...")
        audio_np = audio_tensor.numpy().squeeze()
        transcription = pipe(audio_np, chunk_length_s=10)['text']
        transcription = replace_symbols_back(transcription)

        # Save the processed audio (already in audio_np) for playback in the UI
        sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')

    except Exception as e:
        return str(e), None

    return transcription, "temp_audio.wav"
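
# Illustrative call with a hypothetical URL (args: url, wiener, normalization,
# spectral gating); returns the transcription and the processed-audio path, or
# (error message, None) on failure:
#   text, wav_path = transcribe_from_youtube("https://www.youtube.com/watch?v=...", False, True, False)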

def populate_metadata(url):
    try:
        yt = YouTube(url)
        return yt.thumbnail_url, yt.title
    except Exception:
        # Ignore partial or invalid URLs while the user is still typing
        return None, None

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
              <div>
                <h1>Kabardian Speech Transcription</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                Kabardian speech-to-text transcription using a fine-tuned Wav2Vec2-BERT model
              </p>
            </div>
        """
    )

    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload audio")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")

        transcribe_button.click(fn=transcribe_speech, inputs=[mic_audio], outputs=[transcription_output])

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")

        with gr.Accordion("Audio Improvements", open=False):
            apply_normalization = gr.Checkbox(label="Normalize audio volume", value=False)
            apply_spectral_gating = gr.Checkbox(label="Apply Spectral Gating filter", info="Noise reduction", value=False)
            apply_wiener = gr.Checkbox(label="Apply Wiener filter", info="Noise reduction", value=False)

        with gr.Row():
            img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
            title = gr.Label(label="Video Title", scale=2)

        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
        audio_output = gr.Audio(label="Processed Audio")

        transcribe_button.click(fn=transcribe_from_youtube, inputs=[youtube_url, apply_wiener, apply_normalization, apply_spectral_gating], outputs=[transcription_output, audio_output])
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()