Spaces:

MikeTangoEcho
/

asrnersbx

Paused

App Files Files Community

asrnersbx / app.py

MikeTangoEcho

fix: app.py

c092255 about 1 month ago

raw

history blame

2.88 kB

	import gradio as gr
	import numpy as np
	import torch
	import transformers
	from pathlib import Path
	from transformers import pipeline
	from transformers.utils import logging

	# Log

	# Uncomment to switch the transformers log level to DEBUG:
	# logging.set_verbosity_debug()
	logger = logging.get_logger("transformers")

	# Pipelines

	## Automatic Speech Recognition
	## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
	## Requires ffmpeg to be installed
	asr_model = "openai/whisper-tiny"
	if torch.cuda.is_available():
	    # First GPU, half precision
	    asr_device, asr_torch_dtype = "cuda:0", torch.float16
	else:
	    # CPU fallback, full precision
	    asr_device, asr_torch_dtype = "cpu", torch.float32
	asr = pipeline(
	    task="automatic-speech-recognition",
	    model=asr_model,
	    torch_dtype=asr_torch_dtype,
	    device=asr_device,
	)

	## Token Classification / Named Entity Recognition
	## https://huggingface.co/docs/transformers/task_summary#token-classification
	tc_model = "dslim/distilbert-NER"
	tc_device = "cpu" if not torch.cuda.is_available() else 0
	tc = pipeline(
	    task="token-classification",  # ner
	    model=tc_model,
	    device=tc_device,
	)

	# ---

	# Transformers

	# https://www.gradio.app/main/docs/gradio/audio#behavior
	# As output component: expects audio data in any of these formats:
	# - a str or pathlib.Path filepath
	# - or URL to an audio file,
	# - or a bytes object (recommended for streaming),
	# - or a tuple of (sample rate in Hz, audio data as numpy array)
	def transcribe(audio: str \| Path \| bytes \| tuple[int, np.ndarray] \| None):
	if audio is None:
	return "..."
	# TODO Manage str/Path

	logger.debug("Transcribe")

	text = ""
	# https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
	# Whisper input format for tuple differ from output provided by gradio audio component
	if asr_model.startswith("openai/whisper"):
	sampling_rate, raw = audio

	# Convert to mono if stereo
	if raw.ndim > 1:
	raw = raw.mean(axis=1)

	raw = raw.astype(np.float32)
	raw /= np.max(np.abs(raw))

	inputs = {"sampling_rate": sampling_rate, "raw": raw} if type(audio) is tuple else audio
	transcript = asr(inputs)
	text = transcript['text']

	logger.debug("Tokenize:[" + text + "]")

	entities = tc(text)

	#logger.debug("Classify:[" + entities + "]")

	# TODO Add Text Classification for sentiment analysis
	return {"text": text, "entities": entities}

	# ---

	# Gradio

	## Interfaces

	# https://www.gradio.app/main/docs/gradio/audio
	# Audio can be uploaded as a file or recorded from the microphone.
	input_audio = gr.Audio(sources=["upload", "microphone"], show_share_button=False)

	## App

	# Single-function interface: audio in, highlighted transcript out.
	gradio_app = gr.Interface(
	    fn=transcribe,
	    inputs=[input_audio],
	    outputs=[gr.HighlightedText()],
	    title="ASRNERSBX",
	    description="Transcribe, Tokenize, Classify",
	    flagging_mode="never",
	)

	## Start!
	gradio_app.launch()