Spaces:
Paused
Paused
import gradio as gr | |
import numpy as np | |
import torch | |
import transformers | |
from pathlib import Path | |
from transformers import pipeline | |
from transformers.utils import logging | |
# Logging
# Reuse the logger of the transformers library so that messages from this
# script follow the same verbosity settings as the library itself.
# Uncomment the next line to get debug-level output:
# logging.set_verbosity_debug()
logger = logging.get_logger("transformers")
# Pipelines
## Automatic Speech Recognition
## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
## Requires ffmpeg to be installed
asr_model = "openai/whisper-tiny"
# Pick device and dtype together: first CUDA device with float16 when a GPU
# is available, otherwise CPU with float32.
if torch.cuda.is_available():
    asr_device, asr_torch_dtype = "cuda:0", torch.float16
else:
    asr_device, asr_torch_dtype = "cpu", torch.float32
asr = pipeline(
    task="automatic-speech-recognition",
    model=asr_model,
    torch_dtype=asr_torch_dtype,
    device=asr_device,
)
## Token Classification / Named Entity Recognition
## https://huggingface.co/docs/transformers/task_summary#token-classification
tc_model = "dslim/distilbert-NER"
# Default to CPU; switch to GPU index 0 when CUDA is available.
tc_device = "cpu"
if torch.cuda.is_available():
    tc_device = 0
tc = pipeline(
    task="token-classification",  # ner
    model=tc_model,
    device=tc_device,
)
# ---
# Transformers
# https://www.gradio.app/main/docs/gradio/audio#behavior
# The formats listed below are the ones gr.Audio accepts as an *output* component:
# - a str or pathlib.Path filepath,
# - or a URL to an audio file,
# - or a bytes object (recommended for streaming),
# - or a tuple of (sample rate in Hz, audio data as numpy array).
# As an *input* component (which is how transcribe() is called), gr.Audio passes
# either a str filepath or a (sample rate, numpy array) tuple, depending on `type`.
def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None): | |
if audio is None: | |
return "..." | |
# TODO Manage str/Path | |
logger.debug("Transcribe") | |
text = "" | |
# https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__ | |
# Whisper input format for tuple differ from output provided by gradio audio component | |
if asr_model.startswith("openai/whisper"): | |
sampling_rate, raw = audio | |
# Convert to mono if stereo | |
if raw.ndim > 1: | |
raw = raw.mean(axis=1) | |
raw = raw.astype(np.float32) | |
raw /= np.max(np.abs(raw)) | |
inputs = {"sampling_rate": sampling_rate, "raw": raw} if type(audio) is tuple else audio | |
transcript = asr(inputs) | |
text = transcript['text'] | |
logger.debug("Tokenize:[" + text + "]") | |
entities = tc(text) | |
#logger.debug("Classify:[" + entities + "]") | |
# TODO Add Text Classification for sentiment analysis | |
return {"text": text, "entities": entities} | |
# ---
# Gradio
## Interfaces
# https://www.gradio.app/main/docs/gradio/audio
# Audio input that can be fed from a file upload or from the microphone;
# the share button is hidden.
input_audio = gr.Audio(sources=["upload", "microphone"], show_share_button=False)
## App
# One audio input wired to transcribe(); its result is rendered as
# highlighted text. Flagging is disabled.
gradio_app = gr.Interface(
    fn=transcribe,
    inputs=[input_audio],
    outputs=[gr.HighlightedText()],
    title="ASRNERSBX",
    description="Transcribe, Tokenize, Classify",
    flagging_mode="never",
)

## Start!
gradio_app.launch()