AkitoP's picture
fix
5024e84
raw
history blame
1.23 kB
import os
import time
import warnings
from pathlib import Path
import gradio as gr
import librosa
import spaces
import torch
from transformers import pipeline, WhisperConfig
# Suppress every warning for the lifetime of the process.
warnings.filterwarnings("ignore")

# True when the SYSTEM environment variable is exactly "spaces".
is_hf = os.environ.get("SYSTEM") == "spaces"

# reference from litagin / galgame-whisper-wip
# Keyword arguments handed to the pipeline as `generate_kwargs` on each call.
generate_kwargs = dict(max_new_tokens=256)

# The speech-recognition pipeline is built once at import time. It is placed
# on CUDA when torch reports a CUDA device, otherwise on the CPU.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
@spaces.GPU
def transcribe(audio: str) -> str:
    """Transcribe an audio file with the module-level ``pipe``.

    Args:
        audio: Filesystem path of the audio clip to transcribe. The Gradio
            Audio component supplies ``None`` when nothing has been uploaded
            or recorded.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Without this guard an empty Audio component reaches the pipeline as
    # None and fails with an opaque internal error; gr.Error surfaces a
    # readable message in the UI instead.
    if not audio:
        raise gr.Error("Please upload or record an audio clip first.")

    result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    print(result)  # echo the transcript to stdout
    return result
# Markdown rendered at the top of the page.
initial_md = """
# Whisper Large V3 Japanese Phone Accent
A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitch accent annotations. Built on whisper-large-v3-turbo, it uses a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.
"""

# Page layout: header, audio input (delivered to the callback as a file path),
# a trigger button, and a text box for the transcript.
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Result")

    # Clicking the button runs `transcribe` on the audio and shows the result.
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio],
        outputs=[output],
    )

# Start the web server and ask Gradio to open the page in a browser tab.
app.launch(inbrowser=True)