litagin committed on
Commit
d01f68f
β€’
1 Parent(s): 409084a
Files changed (2) hide show
  1. app.py +37 -27
  2. requirements.txt +1 -0
app.py CHANGED
@@ -2,6 +2,8 @@ import os
2
  import time
3
 
4
  import gradio as gr
 
 
5
  from pydub import AudioSegment
6
  from transformers import pipeline
7
 
@@ -28,44 +30,52 @@ model_dict = {
28
  ),
29
  }
30
 
31
- # Download models
32
- for model in model_dict.values():
33
- pipeline("automatic-speech-recognition", model=model)
 
 
 
 
 
 
34
 
35
 
 
36
  def transcribe_common(audio: str, model: str) -> tuple[str, float]:
 
37
  # Get duration of audio
38
  duration = AudioSegment.from_file(audio).duration_seconds
39
  if duration > 15:
40
  return "Audio too long, limit is 15 seconds", 0
41
  start_time = time.time()
42
- pipe = pipeline("automatic-speech-recognition", model=model)
43
  end_time = time.time()
44
- return pipe(audio, generate_kwargs=generate_kwargs)["text"], end_time - start_time
45
 
46
 
47
  def transcribe_large_v2(audio) -> tuple[str, float]:
48
- return transcribe_common(audio, model_dict["whisper-large-v2"])
49
 
50
 
51
  def transcribe_large_v3(audio) -> tuple[str, float]:
52
- return transcribe_common(audio, model_dict["whisper-large-v3"])
53
 
54
 
55
  def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
56
- return transcribe_common(audio, model_dict["whisper-large-v3-turbo"])
57
 
58
 
59
  def transcribe_kotoba_v1(audio) -> tuple[str, float]:
60
- return transcribe_common(audio, model_dict["kotoba-whisper-v1.0"])
61
 
62
 
63
  def transcribe_kotoba_v2(audio) -> tuple[str, float]:
64
- return transcribe_common(audio, model_dict["kotoba-whisper-v2.0"])
65
 
66
 
67
  def transcribe_galgame_whisper(audio) -> tuple[str, float]:
68
- return transcribe_common(audio, model_dict["galgame-whisper-wip"])
69
 
70
 
71
  initial_md = """
@@ -91,39 +101,39 @@ generate_kwargs = {
91
  with gr.Blocks() as app:
92
  gr.Markdown(initial_md)
93
  audio = gr.Audio(type="filepath")
 
 
 
 
 
 
94
  with gr.Row():
95
  with gr.Column():
96
  gr.Markdown("### Whisper-Large-V2")
97
  button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
98
- output_v2 = gr.Textbox()
99
- time_v2 = gr.Textbox("Time taken")
100
  with gr.Column():
101
  gr.Markdown("### Whisper-Large-V3")
102
  button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
103
- output_v3 = gr.Textbox()
104
- time_v3 = gr.Textbox("Time taken")
105
  with gr.Column():
106
  gr.Markdown("### Whisper-Large-V3-Turbo")
107
  button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
108
- output_v3_turbo = gr.Textbox()
109
- time_v3_turbo = gr.Textbox()
110
  with gr.Row():
111
  with gr.Column():
112
  gr.Markdown("### Kotoba-Whisper-V1.0")
113
  button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
114
- output_kotoba_v1 = gr.Textbox()
115
- time_kotoba_v1 = gr.Textbox("Time taken")
116
  with gr.Column():
117
  gr.Markdown("### Kotoba-Whisper-V2.0")
118
  button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
119
- output_kotoba_v2 = gr.Textbox()
120
- time_kotoba_v2 = gr.Textbox("Time taken")
121
- with gr.Row():
122
- with gr.Column():
123
- gr.Markdown("### Galgame-Whisper (WIP)")
124
- button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
125
- output_galgame = gr.Textbox()
126
- time_galgame = gr.Textbox("Time taken")
127
 
128
  button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
129
  button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
 
2
  import time
3
 
4
  import gradio as gr
5
+ import spaces
6
+ import torch
7
  from pydub import AudioSegment
8
  from transformers import pipeline
9
 
 
30
  ),
31
  }
32
 
33
+ print("Initializing pipelines...")
34
+ pipe_dict = {
35
+ k: pipeline(
36
+ "automatic-speech-recognition",
37
+ model=v,
38
+ device="cuda" if torch.cuda.is_available() or is_hf else "cpu",
39
+ )
40
+ for k, v in model_dict.items()
41
+ }
42
 
43
 
44
+ @spaces.GPU
45
  def transcribe_common(audio: str, model: str) -> tuple[str, float]:
46
+ print(f"Transcribing {audio} with {model}")
47
  # Get duration of audio
48
  duration = AudioSegment.from_file(audio).duration_seconds
49
  if duration > 15:
50
  return "Audio too long, limit is 15 seconds", 0
51
  start_time = time.time()
52
+ result = pipe_dict[model](audio, generate_kwargs=generate_kwargs)["text"]
53
  end_time = time.time()
54
+ return result, end_time - start_time
55
 
56
 
57
  def transcribe_large_v2(audio) -> tuple[str, float]:
58
+ return transcribe_common(audio, "whisper-large-v2")
59
 
60
 
61
  def transcribe_large_v3(audio) -> tuple[str, float]:
62
+ return transcribe_common(audio, "whisper-large-v3")
63
 
64
 
65
  def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
66
+ return transcribe_common(audio, "whisper-large-v3-turbo")
67
 
68
 
69
  def transcribe_kotoba_v1(audio) -> tuple[str, float]:
70
+ return transcribe_common(audio, "kotoba-whisper-v1.0")
71
 
72
 
73
  def transcribe_kotoba_v2(audio) -> tuple[str, float]:
74
+ return transcribe_common(audio, "kotoba-whisper-v2.0")
75
 
76
 
77
  def transcribe_galgame_whisper(audio) -> tuple[str, float]:
78
+ return transcribe_common(audio, "galgame-whisper-wip")
79
 
80
 
81
  initial_md = """
 
101
  with gr.Blocks() as app:
102
  gr.Markdown(initial_md)
103
  audio = gr.Audio(type="filepath")
104
+ with gr.Row():
105
+ with gr.Column():
106
+ gr.Markdown("### Galgame-Whisper (WIP)")
107
+ button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
108
+ time_galgame = gr.Textbox(label="Time taken")
109
+ output_galgame = gr.Textbox(label="Result")
110
  with gr.Row():
111
  with gr.Column():
112
  gr.Markdown("### Whisper-Large-V2")
113
  button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
114
+ time_v2 = gr.Textbox(label="Time taken")
115
+ output_v2 = gr.Textbox(label="Result")
116
  with gr.Column():
117
  gr.Markdown("### Whisper-Large-V3")
118
  button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
119
+ time_v3 = gr.Textbox(label="Time taken")
120
+ output_v3 = gr.Textbox(label="Result")
121
  with gr.Column():
122
  gr.Markdown("### Whisper-Large-V3-Turbo")
123
  button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
124
+ time_v3_turbo = gr.Textbox(label="Time taken")
125
+ output_v3_turbo = gr.Textbox(label="Result")
126
  with gr.Row():
127
  with gr.Column():
128
  gr.Markdown("### Kotoba-Whisper-V1.0")
129
  button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
130
+ time_kotoba_v1 = gr.Textbox(label="Time taken")
131
+ output_kotoba_v1 = gr.Textbox(label="Result")
132
  with gr.Column():
133
  gr.Markdown("### Kotoba-Whisper-V2.0")
134
  button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
135
+ time_kotoba_v2 = gr.Textbox(label="Time taken")
136
+ output_kotoba_v2 = gr.Textbox(label="Result")
 
 
 
 
 
 
137
 
138
  button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
139
  button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio
2
  numpy<2
 
3
  torch
4
  transformers
 
1
  gradio
2
  numpy<2
3
+ spaces
4
  torch
5
  transformers