tan-z-tan committed
Commit 3d444ab
1 Parent(s): 1ecc4f1
Files changed (2)
  1. app.py +82 -35
  2. whisper.py +5 -2
app.py CHANGED
@@ -9,6 +9,7 @@ from whisper import transcribe
 
 # Variables that hold the application state
 data = []
+data_df = pd.DataFrame()
 current_chunk = []
 
 SAMPLING_RATE = 16000
@@ -30,8 +31,83 @@ def resample_audio(audio, orig_sr, target_sr=16000):
     return audio
 
 
+def process_chunk(chunk, language_set) -> pd.DataFrame:
+    print(f"Processing audio chunk of length {len(chunk)}")
+    volume_norm = np.linalg.norm(chunk)
+    length = len(chunk) / SAMPLING_RATE  # length of the audio data in seconds
+    s = datetime.now()
+    selected_scores, all_scores = identify_languages(chunk, language_set)
+    lang_id_time = (datetime.now() - s).total_seconds()
+
+    # Get the probabilities for Japanese and English
+    ja_prob = selected_scores['Japanese']
+    en_prob = selected_scores['English']
+
+    ja_en = 'ja' if ja_prob > en_prob else 'en'
+
+    # Get the top 3 languages
+    top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
+
+    # Transcribe the text
+    s = datetime.now()
+    transcription = transcribe(chunk, language=ja_en)
+    transcribe_time = (datetime.now() - s).total_seconds()
+
+    return pd.DataFrame({
+        "Length (s)": [length],
+        "Volume": [volume_norm],
+        "Japanese_English": [f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})"],
+        "Language": [top3_languages],
+        "Lang ID Time": [lang_id_time],
+        "Transcribe Time": [transcribe_time],
+        "Text": [transcription],
+    })
+
+
+def process_audio_stream(audio, chunk_duration, language_set):
+    global data_df, current_chunk, SAMPLING_RATE
+    print("Process_audio_stream")
+
+    if audio is None:
+        return None, data_df
+
+    sr, audio_data = audio
+
+    # language_set
+    language_set = [lang.strip() for lang in language_set.split(",")]
+    print(audio_data.shape, audio_data.dtype)
+    # Convert to the target sampling rate first
+    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
+    audio_sec = 0
+
+    # Normalize the volume
+    audio_data = normalize_audio(audio_data)
+
+    current_chunk.append(audio_data)
+
+    total_chunk = np.concatenate(current_chunk)
+
+    # Process once the buffer exceeds CHUNK_DURATION
+    if len(total_chunk) >= SAMPLING_RATE * chunk_duration:
+        chunk = total_chunk[:SAMPLING_RATE * chunk_duration]
+        total_chunk = total_chunk[SAMPLING_RATE * chunk_duration:]
+        audio_sec += chunk_duration
+
+        df = process_chunk(chunk, language_set)
+        data_df = pd.concat([data_df, df], ignore_index=True)
+
+        current_chunk = [total_chunk]
+        return (SAMPLING_RATE, chunk), data_df
+    else:
+        return (SAMPLING_RATE, total_chunk), data_df
+
+
 def process_audio(audio, chunk_duration, language_set):
-    global data, current_chunk, SAMPLING_RATE
+    global data, data_df, current_chunk, SAMPLING_RATE
+    # reset state
+    data = []
+    current_chunk = []
+
     print("Process_audio")
     print(audio)
     if audio is None:
@@ -60,39 +136,10 @@ def process_audio(audio, chunk_duration, language_set):
         audio_sec += chunk_duration
 
         print(f"Processing audio chunk of length {len(chunk)}")
-        volume_norm = np.linalg.norm(chunk)
-        length = len(chunk) / SAMPLING_RATE  # length of the audio data in seconds
-        s = datetime.now()
-        selected_scores, all_scores = identify_languages(chunk, language_set)
-        lang_id_time = (datetime.now() - s).total_seconds()
-
-        # Get the probabilities for Japanese and English
-        ja_prob = selected_scores['Japanese']
-        en_prob = selected_scores['English']
-
-        ja_en = 'ja' if ja_prob > en_prob else 'en'
-
-        # Get the top 3 languages
-        top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
-
-        # Transcribe the text
-        s = datetime.now()
-        transcription = transcribe(chunk)
-        transcribe_time = (datetime.now() - s).total_seconds()
-
-        data.append({
-            "Time": audio_sec,
-            "Length (s)": length,
-            "Volume": volume_norm,
-            "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
-            "Language": top3_languages,
-            "Lang ID Time": lang_id_time,
-            "Transcribe Time": transcribe_time,
-            "Text": transcription,
-        })
-
-        df = pd.DataFrame(data)
-        yield (SAMPLING_RATE, chunk), df
+        df = process_chunk(chunk, language_set)
+        data_df = pd.concat([data_df, df], ignore_index=True)
+
+        yield (SAMPLING_RATE, chunk), data_df
 
         # Keep the remaining unprocessed data
         current_chunk = [total_chunk]
@@ -119,7 +166,7 @@ with gr.Blocks() as demo:
 
         with gr.TabItem("Microphone"):
            gr.Interface(
-               fn=process_audio,
+               fn=process_audio_stream,
                inputs=inputs_stream,
                outputs=outputs,
                live=True,
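
Not part of the commit, but for orientation: the new process_audio_stream keeps appending incoming blocks to current_chunk and only cuts off a chunk once chunk_duration seconds at 16 kHz are buffered, carrying the remainder over to the next call. A minimal standalone sketch of that buffering rule, using a hypothetical split_off_chunk helper and dummy arrays instead of the app's real audio path:

import numpy as np

SAMPLING_RATE = 16000

# Hypothetical helper mirroring the buffering rule in process_audio_stream.
def split_off_chunk(current_chunk, new_block, chunk_duration):
    current_chunk.append(new_block)
    total_chunk = np.concatenate(current_chunk)
    chunk_size = SAMPLING_RATE * chunk_duration
    if len(total_chunk) >= chunk_size:
        chunk = total_chunk[:chunk_size]       # processed now
        remainder = total_chunk[chunk_size:]   # carried over to the next call
        return chunk, [remainder]
    return None, [total_chunk]                 # keep buffering

buffer = []
chunk, buffer = split_off_chunk(buffer, np.zeros(12000, dtype=np.float32), 1)
print(chunk is None)     # True: only 0.75 s buffered so far
chunk, buffer = split_off_chunk(buffer, np.zeros(12000, dtype=np.float32), 1)
print(len(chunk))        # 16000: exactly one 1-second chunk split off
print(len(buffer[0]))    # 8000: the remaining 0.5 s stays buffered

The second call crosses the one-second threshold, so exactly 16000 samples are processed and the leftover 8000 samples wait for the next call, which is why the Gradio stream can return a result on some calls and only the growing buffer on others.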
whisper.py CHANGED
@@ -13,9 +13,12 @@ model.to(device)
 SAMPLING_RATE = 16000
 
 
-def transcribe(chunk: np.ndarray) -> str:
+def transcribe(chunk: np.ndarray, language: str = "en") -> str:
+    # Set the tokenizer options for the language setting
+    forced_decoder_ids = processor.tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")
+
     input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
-    predicted_ids = model.generate(input_features)
+    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
     transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
     print(transcriptions)
     return "\n".join(transcriptions)