"""Gradio demo for comparing Galgame-Whisper (WIP) against other Whisper models.

Every model listed in ``model_dict`` is loaded into an ASR pipeline at import
time. The UI exposes one "Transcribe" button per model; each button runs the
uploaded audio through that model and shows the text and the elapsed time.
"""

import os
import time
from pathlib import Path

import gradio as gr
import librosa
import spaces
import torch
from loguru import logger
from transformers import pipeline

# True when the SYSTEM environment variable is "spaces" (Hugging Face Spaces).
is_hf = os.getenv("SYSTEM") == "spaces"

# Sample rate (Hz) that audio is resampled to before being given to a pipeline.
SAMPLE_RATE = 16000
# Longest accepted clip, in seconds; longer uploads are rejected.
MAX_AUDIO_SECONDS = 15

# Generation settings forwarded unchanged to every pipeline call.
generate_kwargs = {
    "language": "Japanese",
    "do_sample": False,
    "num_beams": 1,
    "no_repeat_ngram_size": 3,
}

# UI/model key -> model id (or local path) handed to `pipeline(model=...)`.
model_dict = {
    "whisper-large-v2": "openai/whisper-large-v2",
    "whisper-large-v3": "openai/whisper-large-v3",
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
    # Outside Spaces the fine-tuned checkpoint is read from a local directory.
    "galgame-whisper-wip": (
        "litagin/galgame-whisper-wip"
        if is_hf
        else "../whisper_finetune/galgame-whisper"
    ),
}

logger.info("Initializing pipelines...")
# Decide the device once instead of re-evaluating it for every model.
_device = "cuda" if torch.cuda.is_available() else "cpu"
pipe_dict = {
    name: pipeline(
        "automatic-speech-recognition",
        model=model_id,
        device=_device,
    )
    for name, model_id in model_dict.items()
}
logger.success("Pipelines initialized!")


@spaces.GPU
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
    """Transcribe an audio file with one of the preloaded pipelines.

    Args:
        audio: Path of the audio file to transcribe.
        model: Key into ``pipe_dict`` selecting the pipeline to use.

    Returns:
        ``(text, seconds)`` where ``seconds`` is the time spent inside the
        pipeline call. When the clip is longer than ``MAX_AUDIO_SECONDS`` the
        pipeline is not run and an explanatory message is returned with ``0``.

    Raises:
        KeyError: If ``model`` is not a key of ``pipe_dict``.
    """
    logger.info(f"Transcribing {Path(audio).name} with {model}")
    # Read and resample audio to 16kHz
    y, sr = librosa.load(audio, mono=True, sr=SAMPLE_RATE)
    # Get duration of audio
    duration = librosa.get_duration(y=y, sr=sr)
    logger.info(f"Duration: {duration:.2f}s")
    if duration > MAX_AUDIO_SECONDS:
        return f"Audio too long, limit is {MAX_AUDIO_SECONDS} seconds", 0

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by wall-clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
    elapsed = time.perf_counter() - start_time

    logger.success(f"Transcribed {audio} with {model} in {elapsed:.2f}s")
    logger.success(f"Result:\n{result}")
    return result, elapsed


def transcribe_large_v2(audio: str) -> tuple[str, float]:
    """Transcribe ``audio`` with the "whisper-large-v2" pipeline."""
    return transcribe_common(audio, "whisper-large-v2")


def transcribe_large_v3(audio: str) -> tuple[str, float]:
    """Transcribe ``audio`` with the "whisper-large-v3" pipeline."""
    return transcribe_common(audio, "whisper-large-v3")


def transcribe_large_v3_turbo(audio: str) -> tuple[str, float]:
    """Transcribe ``audio`` with the "whisper-large-v3-turbo" pipeline."""
    return transcribe_common(audio, "whisper-large-v3-turbo")


def transcribe_kotoba_v1(audio: str) -> tuple[str, float]:
    """Transcribe ``audio`` with the "kotoba-whisper-v1.0" pipeline."""
    return transcribe_common(audio, "kotoba-whisper-v1.0")


def transcribe_kotoba_v2(audio: str) -> tuple[str, float]:
    """Transcribe ``audio`` with the "kotoba-whisper-v2.0" pipeline."""
    return transcribe_common(audio, "kotoba-whisper-v2.0")


def transcribe_galgame_whisper(audio: str) -> tuple[str, float]:
    """Transcribe ``audio`` with the "galgame-whisper-wip" pipeline."""
    return transcribe_common(audio, "galgame-whisper-wip")


def warmup() -> tuple[str, float]:
    """Run one transcription of ``test.wav`` with whisper-large-v3-turbo.

    Returns:
        The ``(text, seconds)`` pair produced by the transcription.
    """
    logger.info("Warm-up...")
    return transcribe_large_v3_turbo("test.wav")


def _warmup_text() -> str:
    """Run ``warmup`` and return only its text.

    The load event below feeds a single textbox, so it needs one value rather
    than the ``(text, seconds)`` tuple.
    """
    text, _ = warmup()
    return text


def _model_column(title: str) -> tuple[gr.Button, gr.Textbox, gr.Textbox]:
    """Create one model column: heading, button, time box and result box.

    Must be called inside an active ``gr.Blocks``/``gr.Row`` context.

    Args:
        title: Display name of the model, used in the heading and the button.

    Returns:
        ``(button, time_box, output_box)`` for wiring the click handler.
    """
    with gr.Column():
        gr.Markdown(f"### {title}")
        button = gr.Button(f"Transcribe with {title}")
        time_box = gr.Textbox(label="Time taken")
        output_box = gr.Textbox(label="Result")
    return button, time_box, output_box


initial_md = """
# Galgame-Whisper (WIP) Demo

- https://huggingface.co/litagin/galgame-whisper-wip
- 日本語のみ対応
- 比較できるように他モデルもついでに試せる
- 現在0.1エポックくらい
- 音声は15秒まで

pipeに渡しているkwargsは以下の通り:
```python
generate_kwargs = {
    "language": "Japanese",
    "do_sample": False,
    "num_beams": 1,
    "no_repeat_ngram_size": 3,
}
```
"""

with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")

    # Row 1: the model under development.
    with gr.Row():
        button_galgame, time_galgame, output_galgame = _model_column(
            "Galgame-Whisper (WIP)"
        )

    # Row 2: the OpenAI Whisper checkpoints.
    with gr.Row():
        button_v2, time_v2, output_v2 = _model_column("Whisper-Large-V2")
        button_v3, time_v3, output_v3 = _model_column("Whisper-Large-V3")
        button_v3_turbo, time_v3_turbo, output_v3_turbo = _model_column(
            "Whisper-Large-V3-Turbo"
        )

    # Row 3: the Kotoba-Whisper checkpoints.
    with gr.Row():
        button_kotoba_v1, time_kotoba_v1, output_kotoba_v1 = _model_column(
            "Kotoba-Whisper-V1.0"
        )
        button_kotoba_v2, time_kotoba_v2, output_kotoba_v2 = _model_column(
            "Kotoba-Whisper-V2.0"
        )

    # Hidden sink for the warm-up transcription text.
    warmup_result = gr.Textbox(label="Warm-up result", visible=False)

    # Each handler returns (text, seconds), matching [output, time] below.
    button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
    button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
    button_v3_turbo.click(
        transcribe_large_v3_turbo,
        inputs=audio,
        outputs=[output_v3_turbo, time_v3_turbo],
    )
    button_kotoba_v1.click(
        transcribe_kotoba_v1,
        inputs=audio,
        outputs=[output_kotoba_v1, time_kotoba_v1],
    )
    button_kotoba_v2.click(
        transcribe_kotoba_v2,
        inputs=audio,
        outputs=[output_kotoba_v2, time_kotoba_v2],
    )
    button_galgame.click(
        transcribe_galgame_whisper,
        inputs=audio,
        outputs=[output_galgame, time_galgame],
    )

    # Single output component, so the handler must return a single value.
    app.load(_warmup_text, inputs=[], outputs=[warmup_result], queue=True)

app.launch(inbrowser=True)