Update app.py
Browse files
app.py
CHANGED
@@ -64,7 +64,8 @@ def diarize_audio(temp_file):
|
|
64 |
diarization = pipeline(temp_file)
|
65 |
except Exception as e:
|
66 |
return f"处理音频时出错: {e}"
|
67 |
-
|
|
|
68 |
# 返回 diarization 类对象
|
69 |
return diarization
|
70 |
|
@@ -85,12 +86,12 @@ def calculate_overlap(start1, end1, start2, end2):
|
|
85 |
return overlap_duration
|
86 |
|
87 |
# 获取目标时间段和说话人时间段的重叠比例
|
88 |
-
def
|
89 |
target_start_time = target_time['start_time']
|
90 |
target_end_time = target_time['end_time']
|
91 |
|
92 |
-
#
|
93 |
-
speaker_segments =
|
94 |
for speech_turn in diarization_output.itertracks(yield_label=True): # 使用 itertracks 获取每个说话人的信息
|
95 |
start_seconds = speech_turn[0].start
|
96 |
end_seconds = speech_turn[0].end
|
@@ -98,15 +99,17 @@ def get_best_match(target_time, diarization_output):
|
|
98 |
|
99 |
# 计算目标音频时间段和说话人时间段的重叠时间
|
100 |
overlap = calculate_overlap(target_start_time, target_end_time, start_seconds, end_seconds)
|
101 |
-
overlap_ratio = overlap / (end_seconds - start_seconds)
|
102 |
|
103 |
-
#
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
-
|
107 |
-
best_match = max(speaker_segments, key=lambda x: x[1], default=None)
|
108 |
-
|
109 |
-
return best_match
|
110 |
|
111 |
# 处理音频文件并返回输出
|
112 |
def process_audio(target_audio, mixed_audio):
|
@@ -122,18 +125,20 @@ def process_audio(target_audio, mixed_audio):
|
|
122 |
if isinstance(diarization_result, str) and diarization_result.startswith("错误"):
|
123 |
return diarization_result, None # 出错时返回错误信息
|
124 |
else:
|
125 |
-
#
|
126 |
-
|
127 |
|
128 |
-
if
|
129 |
-
#
|
130 |
-
return
|
|
|
|
|
131 |
|
132 |
# Gradio 接口
|
133 |
with gr.Blocks() as demo:
|
134 |
gr.Markdown("""
|
135 |
# 🗣️ 音频拼接与说话人分类 🗣️
|
136 |
-
|
137 |
""")
|
138 |
|
139 |
mixed_audio_input = gr.Audio(type="filepath", label="上传混合音频")
|
@@ -142,14 +147,13 @@ with gr.Blocks() as demo:
|
|
142 |
process_button = gr.Button("处理音频")
|
143 |
|
144 |
# 输出结果
|
145 |
-
diarization_output = gr.Textbox(label="
|
146 |
-
time_range_output = gr.Textbox(label="最佳匹配时间段")
|
147 |
|
148 |
# 点击按钮时触发处理音频
|
149 |
process_button.click(
|
150 |
fn=process_audio,
|
151 |
inputs=[target_audio_input, mixed_audio_input],
|
152 |
-
outputs=[diarization_output
|
153 |
)
|
154 |
|
155 |
demo.launch(share=True)
|
|
|
64 |
diarization = pipeline(temp_file)
|
65 |
except Exception as e:
|
66 |
return f"处理音频时出错: {e}"
|
67 |
+
|
68 |
+
print(diarization)
|
69 |
# 返回 diarization 类对象
|
70 |
return diarization
|
71 |
|
|
|
86 |
return overlap_duration
|
87 |
|
88 |
# 获取目标时间段和说话人时间段的重叠比例
|
89 |
+
def get_matching_segments(target_time, diarization_output):
    """Return the speech turns, outside the target window, of every speaker heard inside it.

    A speaker "matches" when at least one of their diarized turns overlaps the
    target window. For each matching speaker, the turns that lie entirely
    outside the window are returned; turns that touch the window are dropped.

    The previous implementation tested "turn overlaps the window" and "turn
    lies outside the window" on the same turn, which can never both hold, so
    every matched speaker ended up with an empty list. Matching and collecting
    are now separate passes.

    Args:
        target_time: Mapping with ``'start_time'`` and ``'end_time'`` keys
            giving the target window bounds (same unit as the diarization
            segments; presumably seconds).
        diarization_output: Object exposing ``itertracks(yield_label=True)``
            that yields ``(segment, track, label)`` tuples whose ``segment``
            has ``start`` and ``end`` attributes.

    Returns:
        dict mapping each matching speaker label to a list of
        ``(start, end)`` tuples. A matching speaker with no turn outside the
        window is still present, mapped to an empty list.
    """
    target_start_time = target_time['start_time']
    target_end_time = target_time['end_time']

    # Materialise the turns once because they are scanned twice below.
    turns = [
        (segment.start, segment.end, label)
        for segment, _track, label in diarization_output.itertracks(yield_label=True)
    ]

    # Pass 1: a speaker matches when any of their turns overlaps the target window.
    # NOTE(review): assumes calculate_overlap returns a positive duration only
    # when the two intervals intersect - confirm against its definition.
    speaker_segments = {}
    for start_seconds, end_seconds, label in turns:
        overlap = calculate_overlap(target_start_time, target_end_time, start_seconds, end_seconds)
        if overlap > 0:
            speaker_segments.setdefault(label, [])

    # Pass 2: keep the matched speakers' turns that lie entirely outside the window.
    for start_seconds, end_seconds, label in turns:
        if label not in speaker_segments:
            continue
        if start_seconds >= target_end_time or end_seconds <= target_start_time:
            speaker_segments[label].append((start_seconds, end_seconds))

    return speaker_segments
|
|
|
|
|
|
|
113 |
|
114 |
# 处理音频文件并返回输出
|
115 |
def process_audio(target_audio, mixed_audio):
|
|
|
125 |
if isinstance(diarization_result, str) and diarization_result.startswith("错误"):
|
126 |
return diarization_result, None # 出错时返回错误信息
|
127 |
else:
|
128 |
+
# 获取该说话人的所有匹配时间段(排除目标音频时间段)
|
129 |
+
matching_segments = get_matching_segments(time_dict, diarization_result)
|
130 |
|
131 |
+
if matching_segments:
|
132 |
+
# 返回匹配的说话人标签和他们的时间段
|
133 |
+
return matching_segments
|
134 |
+
else:
|
135 |
+
return "没有找到匹配的说话人时间段。"
|
136 |
|
137 |
# Gradio 接口
|
138 |
with gr.Blocks() as demo:
|
139 |
gr.Markdown("""
|
140 |
# 🗣️ 音频拼接与说话人分类 🗣️
|
141 |
+
上传目标音频和混合音频,拼接并进行说话人分类。结果包括所有匹配说话人的时间段(排除目标录音时间段)。
|
142 |
""")
|
143 |
|
144 |
mixed_audio_input = gr.Audio(type="filepath", label="上传混合音频")
|
|
|
147 |
process_button = gr.Button("处理音频")
|
148 |
|
149 |
# 输出结果
|
150 |
+
diarization_output = gr.Textbox(label="匹配的说话人时间段")
|
|
|
151 |
|
152 |
# 点击按钮时触发处理音频
|
153 |
process_button.click(
|
154 |
fn=process_audio,
|
155 |
inputs=[target_audio_input, mixed_audio_input],
|
156 |
+
outputs=[diarization_output]
|
157 |
)
|
158 |
|
159 |
demo.launch(share=True)
|