tan-z-tan committed on
Commit
3fbd296
1 Parent(s): 8b95d52
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +127 -0
  3. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *~
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+ import torchaudio
6
+ import time
7
+
8
+ from transformers import pipeline
9
+ # from speechbrain.inference.VAD import VAD
10
+ from speechbrain.inference.classifiers import EncoderClassifier
11
+
12
# Speech-to-text pipeline backed by the "openai/whisper-tiny" checkpoint.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
)
# Voice-activity detection is currently disabled:
# VAD = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty", savedir="pretrained_models/vad-crdnn-libriparty")

# Spoken-language classifier loaded from "speechbrain/lang-id-voxlingua107-ecapa".
language_id = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa",
)
15
+
16
# Rows accumulated across calls; each entry describes one processed chunk.
data = []
# Audio samples received so far that have not yet filled a whole chunk.
current_chunk = []

# Class index -> language name, used to label the classifier's top-k indices.
# Presumably this follows the class order of the language-ID model; verify
# against the model's label encoder before changing the order.
index_to_lang = dict(enumerate((
    # 0-9
    'Abkhazian', 'Afrikaans', 'Amharic', 'Arabic', 'Assamese',
    'Azerbaijani', 'Bashkir', 'Belarusian', 'Bulgarian', 'Bengali',
    # 10-19
    'Tibetan', 'Breton', 'Bosnian', 'Catalan', 'Cebuano',
    'Czech', 'Welsh', 'Danish', 'German', 'Greek',
    # 20-29
    'English', 'Esperanto', 'Spanish', 'Estonian', 'Basque',
    'Persian', 'Finnish', 'Faroese', 'French', 'Galician',
    # 30-39
    'Guarani', 'Gujarati', 'Manx', 'Hausa', 'Hawaiian',
    'Hindi', 'Croatian', 'Haitian', 'Hungarian', 'Armenian',
    # 40-49
    'Interlingua', 'Indonesian', 'Icelandic', 'Italian', 'Hebrew',
    'Japanese', 'Javanese', 'Georgian', 'Kazakh', 'Central Khmer',
    # 50-59
    'Kannada', 'Korean', 'Latin', 'Luxembourgish', 'Lingala',
    'Lao', 'Lithuanian', 'Latvian', 'Malagasy', 'Maori',
    # 60-69
    'Macedonian', 'Malayalam', 'Mongolian', 'Marathi', 'Malay',
    'Maltese', 'Burmese', 'Nepali', 'Dutch', 'Norwegian Nynorsk',
    # 70-79
    'Norwegian', 'Occitan', 'Panjabi', 'Polish', 'Pushto',
    'Portuguese', 'Romanian', 'Russian', 'Sanskrit', 'Scots',
    # 80-89
    'Sindhi', 'Sinhala', 'Slovak', 'Slovenian', 'Shona',
    'Somali', 'Albanian', 'Serbian', 'Sundanese', 'Swedish',
    # 90-99
    'Swahili', 'Tamil', 'Telugu', 'Tajik', 'Thai',
    'Turkmen', 'Tagalog', 'Turkish', 'Tatar', 'Ukrainian',
    # 100-106
    'Urdu', 'Uzbek', 'Vietnamese', 'Waray', 'Yiddish',
    'Yoruba', 'Chinese',
)))

# Class indices of the two languages compared head-to-head for each chunk.
lang_index_JA_EN = dict(ja=45, en=20)
46
+
47
def resample_audio(audio, orig_sr, target_sr=16000):
    """Return *audio* as float32 samples at ``target_sr`` Hz.

    The result is always float32, whether or not resampling was needed, so
    callers receive one dtype regardless of the input sample rate. Sample
    values are only cast, not rescaled.

    Args:
        audio: NumPy array of audio samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        A float32 NumPy array. When ``orig_sr`` equals ``target_sr`` the
        samples are returned without resampling.
    """
    # Cast up front: previously only the resampling branch produced float32,
    # so same-rate input kept its original dtype (e.g. int16).
    audio = np.asarray(audio, dtype=np.float32)
    if orig_sr == target_sr:
        return audio

    print(f"Resampling audio from {orig_sr} to {target_sr}")
    resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
    # The resampler works on tensors; add a leading dimension for the call
    # and drop it again afterwards.
    waveform = torch.from_numpy(audio).unsqueeze(0)
    return resampler(waveform).squeeze(0).numpy()
54
+
55
+
56
SAMPLING_RATE = 16000
CHUNK_DURATION = 5  # seconds of audio analysed per chunk


def process_audio(audio):
    """Analyse buffered audio in fixed-length chunks, yielding after each one.

    New samples are appended to the module-level ``current_chunk`` buffer.
    Every complete ``CHUNK_DURATION``-second slice is scored by the
    language-ID model, transcribed, and recorded as one row in the
    module-level ``data`` list. Samples that do not fill a whole chunk stay
    buffered for the next call.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when there is no
            audio. ``samples`` is a NumPy array; 2-D input is treated as
            ``(samples, channels)`` and averaged down to mono.

    Yields:
        ``((SAMPLING_RATE, chunk), df)`` for each processed chunk, where
        ``df`` is a DataFrame built from every row accumulated in ``data``.
        Nothing is yielded when ``audio`` is ``None`` or when less than one
        full chunk is buffered.
    """
    global data, current_chunk
    print("Process_audio")
    print(audio)
    # An empty/cleared input arrives as None; unpacking it would raise.
    if audio is None:
        return
    sr, audio_data = audio

    print(audio_data.shape)
    # The buffer, the classifier call and the transcriber all work on 1-D
    # audio, so collapse multi-channel input to mono first.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1).astype(np.float32)
    # Bring everything to a common sampling rate before buffering.
    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
    audio_sec = 0

    # Append the new samples to whatever was left over from earlier calls.
    current_chunk.append(audio_data)
    total_chunk = np.concatenate(current_chunk)
    # Keep the global buffer in sync as chunks are consumed, so a generator
    # abandoned mid-way does not re-process audio on the next call.
    current_chunk = [total_chunk]

    chunk_len = SAMPLING_RATE * CHUNK_DURATION
    while len(total_chunk) >= chunk_len:
        chunk = total_chunk[:chunk_len]
        total_chunk = total_chunk[chunk_len:]  # drop the part being processed
        current_chunk = [total_chunk]  # remainder carried to the next call
        audio_sec += CHUNK_DURATION

        print(f"Processing audio chunk of length {len(chunk)}")
        # NOTE(review): dividing by the float32 maximum makes this value
        # vanishingly small; confirm the intended normalisation.
        volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
        length = len(chunk) / SAMPLING_RATE  # chunk duration in seconds
        lang_guess = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))

        # Per-language scores for the single item in the batch.
        # Presumably these are (log-)posteriors -- confirm against SpeechBrain.
        scores = lang_guess[0]

        # Scores for Japanese and English, compared head-to-head.
        ja_prob = scores[0][lang_index_JA_EN['ja']].item()
        en_prob = scores[0][lang_index_JA_EN['en']].item()
        ja_en = 'ja' if ja_prob > en_prob else 'en'

        # Names of the three highest-scoring languages.
        top3_indices = torch.topk(scores, 3, dim=1, largest=True).indices[0]
        top3_languages = [index_to_lang[idx.item()] for idx in top3_indices]

        # Transcribe the chunk.
        transcript = transcriber(chunk)
        print(transcript)

        data.append({
            # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
            # Seconds from the start of this call's audio to the chunk's end.
            "Time": audio_sec,
            "Length (s)": length,
            "Volume": volume_norm,
            "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
            "Language": top3_languages,
            "Text": transcript['text'],
        })

        df = pd.DataFrame(data)
        yield (SAMPLING_RATE, chunk), df
112
+
113
# Streaming variant, currently disabled:
# inputs = gr.Audio(sources=["microphone", "upload"], type="numpy", streaming=True)
inputs = gr.Audio(sources=["microphone", "upload"], type="numpy")
outputs = [
    # The audio chunk that was just analysed.
    gr.Audio(type="numpy"),
    # Headers list the columns of the rows built in process_audio, in the
    # same order, so the empty table matches the populated one.
    gr.DataFrame(
        headers=[
            "Time",
            "Length (s)",
            "Volume",
            "Japanese_English",
            "Language",
            "Text",
        ],
    ),
]

# Wire the processing generator to the audio input and the two outputs.
demo = gr.Interface(
    fn=process_audio,
    inputs=inputs,
    outputs=outputs,
    live=True,
    title="Real-time Audio Processing",
    description="Speak into the microphone and see real-time audio processing results."
)

demo.launch()
127
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ sounddevice
3
+ numpy
4
+ pandas
5
+ speechbrain