Spaces: Running on Zero (zero-gpu)
Browse files- app.py +37 -27
- requirements.txt +1 -0
app.py
CHANGED
@@ -2,6 +2,8 @@ import os
|
|
2 |
import time
|
3 |
|
4 |
import gradio as gr
|
|
|
|
|
5 |
from pydub import AudioSegment
|
6 |
from transformers import pipeline
|
7 |
|
@@ -28,44 +30,52 @@ model_dict = {
|
|
28 |
),
|
29 |
}
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
pipeline(
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
|
|
|
36 |
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
|
|
|
37 |
# Get duration of audio
|
38 |
duration = AudioSegment.from_file(audio).duration_seconds
|
39 |
if duration > 15:
|
40 |
return "Audio too long, limit is 15 seconds", 0
|
41 |
start_time = time.time()
|
42 |
-
|
43 |
end_time = time.time()
|
44 |
-
return
|
45 |
|
46 |
|
47 |
def transcribe_large_v2(audio) -> tuple[str, float]:
|
48 |
-
return transcribe_common(audio,
|
49 |
|
50 |
|
51 |
def transcribe_large_v3(audio) -> tuple[str, float]:
|
52 |
-
return transcribe_common(audio,
|
53 |
|
54 |
|
55 |
def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
|
56 |
-
return transcribe_common(audio,
|
57 |
|
58 |
|
59 |
def transcribe_kotoba_v1(audio) -> tuple[str, float]:
|
60 |
-
return transcribe_common(audio,
|
61 |
|
62 |
|
63 |
def transcribe_kotoba_v2(audio) -> tuple[str, float]:
|
64 |
-
return transcribe_common(audio,
|
65 |
|
66 |
|
67 |
def transcribe_galgame_whisper(audio) -> tuple[str, float]:
|
68 |
-
return transcribe_common(audio,
|
69 |
|
70 |
|
71 |
initial_md = """
|
@@ -91,39 +101,39 @@ generate_kwargs = {
|
|
91 |
with gr.Blocks() as app:
|
92 |
gr.Markdown(initial_md)
|
93 |
audio = gr.Audio(type="filepath")
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
with gr.Row():
|
95 |
with gr.Column():
|
96 |
gr.Markdown("### Whisper-Large-V2")
|
97 |
button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
|
98 |
-
|
99 |
-
|
100 |
with gr.Column():
|
101 |
gr.Markdown("### Whisper-Large-V3")
|
102 |
button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
|
103 |
-
|
104 |
-
|
105 |
with gr.Column():
|
106 |
gr.Markdown("### Whisper-Large-V3-Turbo")
|
107 |
button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
|
108 |
-
|
109 |
-
|
110 |
with gr.Row():
|
111 |
with gr.Column():
|
112 |
gr.Markdown("### Kotoba-Whisper-V1.0")
|
113 |
button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
|
114 |
-
|
115 |
-
|
116 |
with gr.Column():
|
117 |
gr.Markdown("### Kotoba-Whisper-V2.0")
|
118 |
button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
|
119 |
-
|
120 |
-
|
121 |
-
with gr.Row():
|
122 |
-
with gr.Column():
|
123 |
-
gr.Markdown("### Galgame-Whisper (WIP)")
|
124 |
-
button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
|
125 |
-
output_galgame = gr.Textbox()
|
126 |
-
time_galgame = gr.Textbox("Time taken")
|
127 |
|
128 |
button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
|
129 |
button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
|
|
|
2 |
import time
|
3 |
|
4 |
import gradio as gr
|
5 |
+
import spaces
|
6 |
+
import torch
|
7 |
from pydub import AudioSegment
|
8 |
from transformers import pipeline
|
9 |
|
|
|
30 |
),
|
31 |
}
|
32 |
|
33 |
+
print("Initializing pipelines...")

# On a Hugging Face ZeroGPU Space the GPU is attached lazily (inside
# @spaces.GPU calls), so target "cuda" whenever a GPU is visible now
# or we are running on HF at all; otherwise fall back to CPU.
# NOTE(review): assumes `is_hf` is a module-level bool set earlier — confirm.
_device = "cuda" if torch.cuda.is_available() or is_hf else "cpu"

# One ASR pipeline per registered model, keyed the same way as model_dict
# so transcribe_common() can look a pipeline up by model name.
pipe_dict = {}
for model_name, model_repo in model_dict.items():
    pipe_dict[model_name] = pipeline(
        "automatic-speech-recognition",
        model=model_repo,
        device=_device,
    )
|
42 |
|
43 |
|
44 |
+
@spaces.GPU
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
    """Transcribe an audio file with the pipeline registered under *model*.

    Args:
        audio: Path to the audio file to transcribe (Gradio filepath input).
        model: Key into the module-level ``pipe_dict`` of ASR pipelines.

    Returns:
        A ``(text, seconds)`` tuple: the transcription and the elapsed
        inference time. If the clip is longer than 15 seconds, returns an
        error message and ``0`` instead of running inference.
    """
    print(f"Transcribing {audio} with {model}")
    # Get duration of audio — refuse long clips to keep GPU time bounded.
    duration = AudioSegment.from_file(audio).duration_seconds
    if duration > 15:
        return "Audio too long, limit is 15 seconds", 0
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # wall-clock (NTP/DST) adjustments the way time.time() can.
    start_time = time.perf_counter()
    result = pipe_dict[model](audio, generate_kwargs=generate_kwargs)["text"]
    end_time = time.perf_counter()
    return result, end_time - start_time
|
55 |
|
56 |
|
57 |
def transcribe_large_v2(audio) -> tuple[str, float]:
    """Transcribe *audio* with the Whisper-Large-V2 pipeline."""
    model_key = "whisper-large-v2"
    return transcribe_common(audio, model_key)
|
59 |
|
60 |
|
61 |
def transcribe_large_v3(audio) -> tuple[str, float]:
    """Transcribe *audio* with the Whisper-Large-V3 pipeline."""
    model_key = "whisper-large-v3"
    return transcribe_common(audio, model_key)
|
63 |
|
64 |
|
65 |
def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
    """Transcribe *audio* with the Whisper-Large-V3-Turbo pipeline."""
    model_key = "whisper-large-v3-turbo"
    return transcribe_common(audio, model_key)
|
67 |
|
68 |
|
69 |
def transcribe_kotoba_v1(audio) -> tuple[str, float]:
    """Transcribe *audio* with the Kotoba-Whisper-V1.0 pipeline."""
    model_key = "kotoba-whisper-v1.0"
    return transcribe_common(audio, model_key)
|
71 |
|
72 |
|
73 |
def transcribe_kotoba_v2(audio) -> tuple[str, float]:
    """Transcribe *audio* with the Kotoba-Whisper-V2.0 pipeline."""
    model_key = "kotoba-whisper-v2.0"
    return transcribe_common(audio, model_key)
|
75 |
|
76 |
|
77 |
def transcribe_galgame_whisper(audio) -> tuple[str, float]:
    """Transcribe *audio* with the Galgame-Whisper (WIP) pipeline."""
    model_key = "galgame-whisper-wip"
    return transcribe_common(audio, model_key)
|
79 |
|
80 |
|
81 |
initial_md = """
|
|
|
101 |
with gr.Blocks() as app:
|
102 |
gr.Markdown(initial_md)
|
103 |
audio = gr.Audio(type="filepath")
|
104 |
+
with gr.Row():
|
105 |
+
with gr.Column():
|
106 |
+
gr.Markdown("### Galgame-Whisper (WIP)")
|
107 |
+
button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
|
108 |
+
time_galgame = gr.Textbox(label="Time taken")
|
109 |
+
output_galgame = gr.Textbox(label="Result")
|
110 |
with gr.Row():
|
111 |
with gr.Column():
|
112 |
gr.Markdown("### Whisper-Large-V2")
|
113 |
button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
|
114 |
+
time_v2 = gr.Textbox(label="Time taken")
|
115 |
+
output_v2 = gr.Textbox(label="Result")
|
116 |
with gr.Column():
|
117 |
gr.Markdown("### Whisper-Large-V3")
|
118 |
button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
|
119 |
+
time_v3 = gr.Textbox(label="Time taken")
|
120 |
+
output_v3 = gr.Textbox(label="Result")
|
121 |
with gr.Column():
|
122 |
gr.Markdown("### Whisper-Large-V3-Turbo")
|
123 |
button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
|
124 |
+
time_v3_turbo = gr.Textbox(label="Time taken")
|
125 |
+
output_v3_turbo = gr.Textbox(label="Result")
|
126 |
with gr.Row():
|
127 |
with gr.Column():
|
128 |
gr.Markdown("### Kotoba-Whisper-V1.0")
|
129 |
button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
|
130 |
+
time_kotoba_v1 = gr.Textbox(label="Time taken")
|
131 |
+
output_kotoba_v1 = gr.Textbox(label="Result")
|
132 |
with gr.Column():
|
133 |
gr.Markdown("### Kotoba-Whisper-V2.0")
|
134 |
button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
|
135 |
+
time_kotoba_v2 = gr.Textbox(label="Time taken")
|
136 |
+
output_kotoba_v2 = gr.Textbox(label="Result")
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
|
139 |
button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
gradio
|
2 |
numpy<2
|
|
|
3 |
torch
|
4 |
transformers
|
|
|
1 |
gradio
|
2 |
numpy<2
|
3 |
+
spaces
|
4 |
torch
|
5 |
transformers
|