anzorq committed
Commit
15ae509
1 Parent(s): da952ef

Update app.py

Files changed (1)
  1. app.py +37 -39
app.py CHANGED
@@ -3,18 +3,16 @@ import os
 import gradio as gr
 import torch
 import torchaudio
-from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
-from pytube import YouTube
 from transformers import pipeline
+from pytube import YouTube
 import re
-from pydub import AudioSegment
-from scipy.io import wavfile
-from scipy.signal import wiener
 import numpy as np
+from scipy.signal import wiener
+from io import BytesIO
 
-# pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0) # old model
-pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0) # new model with a new tokenizer
+pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
 
+# Define the replacements for Kabardian transcription
 replacements = [
     ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
     ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
@@ -22,55 +20,55 @@ replacements = [
     ('щӏ', 'ɕ'), ('я', 'йа')
 ]
 
+# Reverse replacements for transcription
 reverse_replacements = {v: k for k, v in replacements}
 reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))
 
 def replace_symbols_back(text):
     return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
 
-def normalize_audio(audio_path):
-    audio = AudioSegment.from_file(audio_path, format="mp4")
-    normalized_audio = audio.normalize()
-    normalized_audio.export(audio_path, format="mp4")
+def preprocess_audio(audio_tensor, original_sample_rate):
+    audio_tensor = audio_tensor.to(dtype=torch.float32)
+    audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True) # Convert to mono
+    audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor)) # Normalize
+    audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000) # Resample
+    return audio_tensor
 
-def apply_wiener_filter(audio_path):
-    sample_rate, audio_data = wavfile.read(audio_path)
+def apply_wiener_filter(audio_tensor):
+    audio_data = audio_tensor.numpy()
     filtered_audio = wiener(audio_data)
-    wavfile.write(audio_path, sample_rate, filtered_audio.astype(np.int16))
-
-def resample_audio(audio_path, target_sample_rate=16000):
-    audio, sample_rate = torchaudio.load(audio_path)
-    resampled_audio = torchaudio.transforms.Resample(sample_rate, target_sample_rate)(audio)
-    torchaudio.save(audio_path, resampled_audio, target_sample_rate)
+    return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
 
 @spaces.GPU
 def transcribe_speech(audio, progress=gr.Progress()):
-    if audio is None: # Handle the NoneType error for microphone input
+    if audio is None:
         return "No audio received."
-
     progress(0.5, desc="Transcribing audio...")
-    transcription = pipe(audio, chunk_length_s=10)['text']
-
+    audio_np = audio.numpy().squeeze()
+    transcription = pipe(audio_np, chunk_length_s=10)['text']
     return replace_symbols_back(transcription)
 
 def transcribe_from_youtube(url, apply_improvements, progress=gr.Progress()):
     progress(0, "Downloading YouTube audio...")
-    audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
-
-    if apply_improvements:
-        progress(0.2, "Normalizing audio...")
-        normalize_audio(audio_path)
-
-        progress(0.4, "Applying Wiener filter...")
-        apply_wiener_filter(audio_path)
-
-        progress(0.6, "Resampling audio...")
-        resample_audio(audio_path)
-
-    progress(0.8, "Transcribing audio...")
-    transcription = transcribe_speech(audio_path)
-
-    os.remove(audio_path)
+
+    yt = YouTube(url)
+    stream = yt.streams.filter(only_audio=True).first()
+    audio_data = BytesIO()
+    stream.stream_to_buffer(audio_data)
+    audio_data.seek(0)
+
+    try:
+        audio, original_sample_rate = torchaudio.load(audio_data)
+        audio = preprocess_audio(audio, original_sample_rate)
+
+        if apply_improvements:
+            progress(0.4, "Applying Wiener filter...")
+            audio = apply_wiener_filter(audio)
+
+        transcription = transcribe_speech(audio)
+
+    except Exception as e:
+        return str(e)
 
     return transcription
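For context on the tables this commit keeps: the model emits single-character stand-ins for Kabardian digraphs, and replace_symbols_back restores the Cyrillic spelling after decoding. A minimal round-trip sketch, using only the abridged pairs visible in the diff (the example word is an assumption):

import re

# Abridged pairs from the diff; the full table in app.py has more rows.
replacements = [
    ('гъ', 'ɣ'), ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('я', 'йа'),
]

reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))

def replace_symbols_back(text):
    return reverse_pattern.sub(lambda m: reverse_replacements[m.group(0)], text)

word = 'къафэ'                 # example word (assumption)
encoded = word
for src, dst in replacements:  # 'кхъ' is listed before 'къ', so the longer match wins
    encoded = encoded.replace(src, dst)

print(encoded)                         # qафэ
print(replace_symbols_back(encoded))   # къафэ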
 
 
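The commit also collapses the old pydub normalize, scipy WAV round trip, and separate resample into one tensor-based preprocess_audio. A sanity-check sketch of what it does (function lightly condensed from the diff; the stereo shape and 44.1 kHz rate are illustrative assumptions):

import torch
import torchaudio

def preprocess_audio(audio_tensor, original_sample_rate):
    audio_tensor = audio_tensor.to(dtype=torch.float32)
    audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)      # stereo -> mono
    audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor))  # peak-normalize
    return torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000)

waveform = torch.randn(2, 44100)         # 1 s of synthetic stereo audio at 44.1 kHz
out = preprocess_audio(waveform, 44100)
print(out.shape)                         # torch.Size([1, 16000])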
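apply_wiener_filter now denoises the float tensor in memory instead of reading and writing an int16 WAV file. A minimal sketch of the new round trip (the input shape is an assumption; scipy returns float64, hence the explicit cast back to the tensor's dtype):

import torch
from scipy.signal import wiener

def apply_wiener_filter(audio_tensor):
    audio_data = audio_tensor.numpy()
    filtered_audio = wiener(audio_data)   # local mean/variance Wiener filter over (channels, samples)
    return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)

noisy = torch.randn(1, 16000)
denoised = apply_wiener_filter(noisy)
print(denoised.shape, denoised.dtype)    # torch.Size([1, 16000]) torch.float32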
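transcribe_speech likewise switches from passing a file path to passing a raw NumPy array into the pipeline. A sketch of that call pattern on synthetic input (the model id is the one loaded above; a raw array is assumed to already be at the model's 16 kHz sampling rate, and chunk_length_s=10 splits longer audio into 10-second windows):

import numpy as np
from transformers import pipeline

pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)

audio_np = np.zeros(16000, dtype=np.float32)      # one second of silence at 16 kHz
text = pipe(audio_np, chunk_length_s=10)['text']
print(text)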
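Finally, the temp-file download (tmp.mp4 plus an os.remove cleanup) gives way to an in-memory BytesIO buffer. A condensed sketch of the new flow, assuming the installed torchaudio backend can decode the MP4 container from a file-like object (the URL is a placeholder):

from io import BytesIO
from pytube import YouTube
import torchaudio

buf = BytesIO()
stream = YouTube("https://www.youtube.com/watch?v=...").streams.filter(only_audio=True).first()
stream.stream_to_buffer(buf)   # download straight into memory; nothing to delete afterwards
buf.seek(0)                    # rewind before decoding

waveform, sample_rate = torchaudio.load(buf)
print(waveform.shape, sample_rate)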