QLWD committed
Commit 6d3bc8f
1 Parent(s): d8b6286

Update app.py

Files changed (1)
  1. app.py +30 -32
app.py CHANGED
@@ -2,15 +2,14 @@ import torch
 import spaces
 import gradio as gr
 import os
+from pyannote.audio import Pipeline
 from pydub import AudioSegment
-from pyannote.audio.pipelines import SpeakerDiarization
-
 
 # Initialize the pyannote/speaker-diarization model
 HF_TOKEN = os.environ.get("HUGGINGFACE_READ_TOKEN")
 pipeline = None
 try:
-    pipeline = SpeakerDiarization.from_pretrained(
+    pipeline = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN
     )
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
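In pyannote.audio 3.x, pretrained hub pipelines are loaded through the generic Pipeline.from_pretrained factory, which is what the new import reflects; SpeakerDiarization under pyannote.audio.pipelines is the raw pipeline class rather than the hub loader. The hunk is truncated at the device line; a minimal sketch of how loading and device placement presumably continue (the pipeline.to(device) call is an assumption, not shown in the diff):

    # Minimal sketch, assuming pyannote.audio 3.x; the .to(device) call is
    # presumed from the truncated hunk rather than shown in the diff.
    import os
    import torch
    from pyannote.audio import Pipeline

    HF_TOKEN = os.environ.get("HUGGINGFACE_READ_TOKEN")
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline.to(device)  # pyannote pipelines can be moved to a torch.device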
@@ -19,16 +18,16 @@ except Exception as e:
     print(f"Error initializing pipeline: {e}")
     pipeline = None
 
-# Audio processing function: concatenate the target and mixed audio
+# Audio concatenation function: concatenate the target and mixed audio
 def combine_audio_with_time(target_audio, mixed_audio):
     if pipeline is None:
         return "Error: model not initialized"
 
     # Load the target speaker's sample audio
-    target_audio_segment = AudioSegment.from_wav(target_audio.name)
+    target_audio_segment = AudioSegment.from_wav(target_audio)
 
     # Load the mixed audio
-    mixed_audio_segment = AudioSegment.from_wav(mixed_audio.name)
+    mixed_audio_segment = AudioSegment.from_wav(mixed_audio)
 
     # Record the target speaker's start time (precise to 0.01 s)
     target_start_time = len(mixed_audio_segment) / 1000  # in seconds, precise to 0.01 s
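Dropping .name matches the gr.Audio(type="filepath") inputs, which hand the function a plain path string rather than a file object; AudioSegment.from_wav accepts a path directly. The actual concatenation and export in the middle of combine_audio_with_time fall outside the hunk; a plausible sketch of that elided step with pydub, assuming the target segment is simply appended to the mixed segment and written to final_output.wav:

    # Hypothetical reconstruction of the elided concatenation step; only the
    # surrounding lines appear in the diff. Paths are placeholders.
    from pydub import AudioSegment

    mixed_audio_segment = AudioSegment.from_wav("mixed.wav")
    target_audio_segment = AudioSegment.from_wav("target.wav")

    final_audio = mixed_audio_segment + target_audio_segment  # pydub's + appends segments
    final_audio.export("final_output.wav", format="wav")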
@@ -42,7 +41,7 @@ def combine_audio_with_time(target_audio, mixed_audio):
     return "final_output.wav", target_start_time
 
 # Use pyannote/speaker-diarization to run speaker diarization on the concatenated audio
-@spaces.GPU(duration=60 * 2)
+@spaces.GPU(duration=60 * 2)  # Run on GPU; execution time limited to 120 seconds
 def diarize_audio(temp_file):
     if pipeline is None:
         return "Error: model not initialized"
@@ -55,7 +54,7 @@ def diarize_audio(temp_file):
     # Return the diarization output
     return str(diarization)
 
-# Process and generate the label file
+# Function that generates the label file
 def generate_labels_from_diarization(diarization_output):
     labels_path = 'labels.txt'
     successful_lines = 0
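The body of generate_labels_from_diarization falls between the hunks and is not shown. Since diarize_audio returns str(diarization), whose lines look like "[ 00:00:00.497 -->  00:00:01.690] A SPEAKER_00", a hedged sketch of how such output could be turned into tab-separated labels; the real parsing in app.py may differ:

    # Hypothetical label writer for str(diarization) output; the actual
    # implementation in app.py is elided from this diff.
    import re

    def labels_from_diarization_sketch(diarization_output, labels_path="labels.txt"):
        pattern = re.compile(r"\[\s*([\d:.]+)\s*-->\s*([\d:.]+)\s*\]\s+\S+\s+(\S+)")

        def to_seconds(ts):  # stand-in for the file's timestamp_to_seconds helper
            h, m, s = ts.split(":")
            return int(h) * 3600 + int(m) * 60 + float(s)

        with open(labels_path, "w") as f:
            for line in diarization_output.splitlines():
                match = pattern.search(line)
                if match:
                    start, end, speaker = match.groups()
                    f.write(f"{to_seconds(start):.2f}\t{to_seconds(end):.2f}\t{speaker}\n")
        return labels_path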
@@ -89,47 +88,46 @@ def timestamp_to_seconds(timestamp):
     except ValueError as e:
         print(f"Error converting timestamp: '{timestamp}'. Error: {e}")
         return None
 
-@spaces.GPU(duration=60 * 2)
-# Process the audio file
-def process_audio(audio):
-    diarization_result = diarize_audio(save_audio(audio))
+# Process the audio files and return the outputs
+def process_audio(target_audio, mixed_audio):
+    # Concatenate the audio files
+    final_audio_path, target_start_time = combine_audio_with_time(target_audio, mixed_audio)
+
+    # Run speaker diarization
+    diarization_result = diarize_audio(final_audio_path)
+
     if diarization_result.startswith("Error"):
-        return diarization_result, None  # On error, return the error message and an empty label file
+        return diarization_result, None, None  # On error, return the error message
     else:
+        # Generate the label file
         label_file = generate_labels_from_diarization(diarization_result)
-        return diarization_result, label_file
-
-# Save the uploaded audio
-def save_audio(audio):
-    with open(audio.name, "rb") as f:
-        audio_data = f.read()
-
-    # Save the uploaded audio file to a temporary location
-    with open("temp.wav", "wb") as f:
-        f.write(audio_data)
-
-    return "temp.wav"
+        return diarization_result, label_file, final_audio_path  # Return the diarization result, label file, and clipped audio path
 
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("""
     # 🗣️ Audio Concatenation and Speaker Classification 🗣️
-    Upload a target speaker audio file and a mixed audio file; they will be concatenated and the speakers classified.
+    Upload a target speaker audio file and a mixed audio file; they will be concatenated and the speakers classified. The results include the diarization output, a label file, and the clipped audio file.
     """)
 
-    audio_input = gr.Audio(type="filepath", label="Upload target speaker audio")
+    target_audio_input = gr.Audio(type="filepath", label="Upload target speaker audio")
     mixed_audio_input = gr.Audio(type="filepath", label="Upload mixed audio")
 
     process_button = gr.Button("Process audio")
 
+    # Output components
     diarization_output = gr.Textbox(label="Diarization result")
     label_file_link = gr.File(label="Download label file")
+
+    # Return the audio through a gr.Audio component
+    final_audio_link = gr.Audio(label="Download clipped audio", type="filepath")
 
+    # Trigger audio processing on button click
     process_button.click(
         fn=process_audio,
-        inputs=[audio_input],
-        outputs=[diarization_output, label_file_link]
+        inputs=[target_audio_input, mixed_audio_input],
+        outputs=[diarization_output, label_file_link, final_audio_link]
     )
 
 demo.launch(share=False)
 
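Taken together, the commit routes everything through file paths. For reference, a minimal end-to-end sketch of the updated flow outside Gradio, assuming the pipeline initialized successfully; "target.wav" and "mixed.wav" are placeholder file names:

    # End-to-end sketch of the updated flow, outside Gradio.
    final_path, start_time = combine_audio_with_time("target.wav", "mixed.wav")
    diarization_result = diarize_audio(final_path)
    if not diarization_result.startswith("Error"):
        label_file = generate_labels_from_diarization(diarization_result)
        print(f"Target speaker starts at {start_time:.2f}s; labels written to {label_file}")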