litagin committed on
Commit
bef66de
·
1 Parent(s): 66b8eb3
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +137 -0
  3. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv/
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+
4
+ import gradio as gr
5
+ from pydub import AudioSegment
6
+ from transformers import pipeline
7
+
8
# True when the SYSTEM environment variable equals "spaces"
# (presumably set by Hugging Face Spaces -- confirm against the deployment).
is_hf = os.getenv("SYSTEM") == "spaces"

# Generation options forwarded to every transcription call.
generate_kwargs = dict(
    language="Japanese",
    do_sample=False,
    num_beams=1,
    no_repeat_ngram_size=3,
)

# Display name -> model identifier (or local path) handed to `pipeline`.
model_dict = {
    "whisper-large-v2": "openai/whisper-large-v2",
    "whisper-large-v3": "openai/whisper-large-v3",
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
}
# The work-in-progress checkpoint comes from the named repo when `is_hf` is
# true and from a sibling local directory otherwise.
if is_hf:
    model_dict["galgame-whisper-wip"] = "litagin/galgame-whisper-wip"
else:
    model_dict["galgame-whisper-wip"] = "../whisper_finetune/galgame-whisper"

# Build each pipeline once at startup so that every model gets downloaded;
# the resulting pipeline objects are discarded.
for model in model_dict.values():
    pipeline("automatic-speech-recognition", model=model)
34
+
35
+
36
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
    """Transcribe an audio file with the given model and time the inference.

    Args:
        audio: Path to the audio file to transcribe.
        model: Model identifier or local path passed to ``pipeline``.

    Returns:
        A ``(text, seconds)`` tuple. ``seconds`` is the wall-clock duration of
        the transcription call only; building the pipeline is not included.
        When no audio is supplied, or the clip is longer than 15 seconds, a
        short message is returned in place of the text and ``seconds`` is
        ``0.0``.
    """
    # Guard against a missing/empty path (presumably what the UI sends when
    # nothing has been uploaded) instead of failing inside pydub.
    if not audio:
        return "No audio provided", 0.0

    # Reject long clips before paying for model loading.
    duration = AudioSegment.from_file(audio).duration_seconds
    if duration > 15:
        return "Audio too long, limit is 15 seconds", 0.0

    pipe = pipeline("automatic-speech-recognition", model=model)

    # Start the clock only after the pipeline exists, and stop it after the
    # pipeline has actually run, so the reported time is the inference time.
    start_time = time.perf_counter()
    text = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    elapsed = time.perf_counter() - start_time
    return text, elapsed
45
+
46
+
47
def transcribe_large_v2(audio) -> tuple[str, float]:
    """Run `transcribe_common` on `audio` with the "whisper-large-v2" entry."""
    model_id = model_dict["whisper-large-v2"]
    return transcribe_common(audio, model_id)
49
+
50
+
51
def transcribe_large_v3(audio) -> tuple[str, float]:
    """Run `transcribe_common` on `audio` with the "whisper-large-v3" entry."""
    model_id = model_dict["whisper-large-v3"]
    return transcribe_common(audio, model_id)
53
+
54
+
55
def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
    """Run `transcribe_common` on `audio` with the "whisper-large-v3-turbo" entry."""
    model_id = model_dict["whisper-large-v3-turbo"]
    return transcribe_common(audio, model_id)
57
+
58
+
59
def transcribe_kotoba_v1(audio) -> tuple[str, float]:
    """Run `transcribe_common` on `audio` with the "kotoba-whisper-v1.0" entry."""
    model_id = model_dict["kotoba-whisper-v1.0"]
    return transcribe_common(audio, model_id)
61
+
62
+
63
def transcribe_kotoba_v2(audio) -> tuple[str, float]:
    """Run `transcribe_common` on `audio` with the "kotoba-whisper-v2.0" entry."""
    model_id = model_dict["kotoba-whisper-v2.0"]
    return transcribe_common(audio, model_id)
65
+
66
+
67
def transcribe_galgame_whisper(audio) -> tuple[str, float]:
    """Run `transcribe_common` on `audio` with the "galgame-whisper-wip" entry."""
    model_id = model_dict["galgame-whisper-wip"]
    return transcribe_common(audio, model_id)
69
+
70
+
71
# Introductory Markdown rendered at the top of the demo page.
initial_md = """
# Galgame-Whisper (WIP) Demo

- 日本語のみ対応
- 他の書き起こしとついでに比較できるようにいろいろ入れた
- 現在0.1エポックくらい
- 速度はCPUです
- 音声は15秒まで
"""

with gr.Blocks() as app:
    # Render the intro text; it replaces a stray model heading that was shown
    # here while `initial_md` itself was never displayed.
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")

    # Each column holds one model: heading, trigger button, transcript box and
    # a box for the elapsed time. "Time taken" is passed as `label=` so it
    # titles the box instead of being pre-filled as its value.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Whisper-Large-V2")
            button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
            output_v2 = gr.Textbox()
            time_v2 = gr.Textbox(label="Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3")
            button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
            output_v3 = gr.Textbox()
            time_v3 = gr.Textbox(label="Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3-Turbo")
            button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
            output_v3_turbo = gr.Textbox()
            time_v3_turbo = gr.Textbox(label="Time taken")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V1.0")
            button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
            output_kotoba_v1 = gr.Textbox()
            time_kotoba_v1 = gr.Textbox(label="Time taken")
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V2.0")
            button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
            output_kotoba_v2 = gr.Textbox()
            time_kotoba_v2 = gr.Textbox(label="Time taken")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Galgame-Whisper (WIP)")
            button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
            output_galgame = gr.Textbox()
            time_galgame = gr.Textbox(label="Time taken")

    # Wire every button to its model-specific transcription function; each
    # handler returns (text, seconds), matching the two output boxes.
    button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
    button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
    button_v3_turbo.click(
        transcribe_large_v3_turbo,
        inputs=audio,
        outputs=[output_v3_turbo, time_v3_turbo],
    )
    button_kotoba_v1.click(
        transcribe_kotoba_v1, inputs=audio, outputs=[output_kotoba_v1, time_kotoba_v1]
    )
    button_kotoba_v2.click(
        transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
    )
    button_galgame.click(
        transcribe_galgame_whisper,
        inputs=audio,
        outputs=[output_galgame, time_galgame],
    )

app.launch(inbrowser=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ numpy<2
3
+ torch
4
+ transformers