nb-whisper-demo

Running on T4

App Files Files Community

nb-whisper-demo / app.py

AngelinaZanardi

Update app.py

ebf43e3 verified about 2 months ago

raw

history blame

5.1 kB

	import time
	import os
	import re

	import torch

	import gradio as gr
	import spaces
	from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
	from huggingface_hub import model_info
	# Probe for the optional flash-attn package; the import itself is unused and
	# only tells us whether the "flash_attention_2" backend can be requested.
	try:
	    import flash_attn  # noqa: F401
	    FLASH_ATTENTION = True
	except ImportError:
	    FLASH_ATTENTION = False

	import yt_dlp # Added import for yt-dlp

	# Hugging Face Hub id of the checkpoint loaded by the ASR pipeline.
	MODEL_NAME = "NbAiLab/nb-whisper-large"
	# Language code passed to the tokenizer when building the decoder prompt.
	lang = "no"
	# NOTE(review): absolute path on a developer machine; presumably absent on the
	# deployment host -- confirm and ship the logo with the app instead.
	logo_path = "/home/angelina/Nedlastinger/Screenshot 2024-10-10 at 13-30-13 Nasjonalbiblioteket — Melkeveien designkontor.png"

	# SHARE is truthy when its first character is t/y/1 (case-insensitive).
	# An unset, empty or whitespace-only value falls back to "False" instead of
	# raising IndexError on the [0] lookup. `share` is True or None, never False.
	_share_flag = (os.environ.get("SHARE") or "False").strip()[:1].lower()
	share = (_share_flag in ("t", "y", "1")) or None
	# Use the explicit token when provided; otherwise pass True to the pipeline.
	auth_token = os.environ.get("AUTH_TOKEN") or True
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	print(f"Bruker enhet: {device}")

	@spaces.GPU(duration=60 * 2)
	def pipe(file, return_timestamps=False):
	    """Run the NB-Whisper ASR pipeline on one audio input.

	    Args:
	        file: Audio input forwarded unchanged to the transformers pipeline
	            (the callers in this file pass a file path).
	        return_timestamps: When true, the decoder prompt allows timestamp
	            tokens and the pipeline is asked to return timestamps.

	    Returns:
	        Whatever the pipeline call returns; callers in this file read its
	        "text" and "chunks" entries.
	    """
	    on_gpu = device.type == "cuda"
	    # Half precision and FlashAttention-2 are only requested on CUDA; on the
	    # CPU fallback use float32 with the SDPA attention implementation.
	    attn_implementation = "flash_attention_2" if (FLASH_ATTENTION and on_gpu) else "sdpa"
	    # NOTE(review): the pipeline (and therefore the model) is rebuilt on every
	    # call; caching it would be faster but shares mutable config across requests.
	    asr = pipeline(
	        task="automatic-speech-recognition",
	        model=MODEL_NAME,
	        chunk_length_s=28,
	        device=device,
	        token=auth_token,
	        torch_dtype=torch.float16 if on_gpu else torch.float32,
	        model_kwargs={"attn_implementation": attn_implementation, "num_beams": 5},
	    )
	    # Pin language and task in the decoder prompt for this request.
	    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
	        language=lang,
	        task="transcribe",
	        no_timestamps=not return_timestamps,
	    )
	    return asr(file, return_timestamps=return_timestamps, batch_size=24)

	def format_output(text):
	    """Insert line breaks after sentence-ending punctuation.

	    A single ".", "!", ":" or "?" that is neither preceded nor followed by a
	    dot gets one newline appended. A run of three or more dots gets two
	    newlines appended. All other text, including the whitespace that follows
	    the punctuation, is left untouched.

	    The original second pattern escaped the alternation bar, so it only
	    matched a literal pipe character and the ellipsis rule never fired. Both
	    rules now run in a single pass so no punctuation mark is processed twice.

	    Args:
	        text: Transcribed text.

	    Returns:
	        The text with the line breaks inserted.
	    """
	    def _with_break(match):
	        token = match.group()
	        # Only the ellipsis alternative can match three or more characters.
	        return token + ("\n\n" if len(token) >= 3 else "\n")

	    return re.sub(r'\.{3,}|(?<!\.)[.!:?](?!\.)', _with_break, text)

	def _format_timestamp(seconds):
	    """Render a second offset as HH:MM:SS, or "??:??:??" when it is None."""
	    if seconds is None:
	        return "??:??:??"
	    return time.strftime('%H:%M:%S', time.gmtime(seconds))


	def transcribe(file, return_timestamps=False):
	    """Transcribe an audio input and return display-ready text.

	    Args:
	        file: Audio input handed to `pipe`.
	        return_timestamps: When false, the plain transcript is passed through
	            `format_output`. When true, each chunk becomes one line of the
	            form "[start -> end] text".

	    Returns:
	        The formatted transcript as a single string.
	    """
	    if not return_timestamps:
	        return format_output(pipe(file)["text"])

	    chunks = pipe(file, return_timestamps=True)["chunks"]
	    lines = [
	        f"[{_format_timestamp(chunk['timestamp'][0])} -> "
	        f"{_format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
	        for chunk in chunks
	    ]
	    return "\n".join(lines)

	def _return_yt_html_embed(yt_url):
	video_id = yt_url.split("?v=")[-1]
	HTML_str = (
	f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
	" </center>"
	)
	return HTML_str

	def yt_transcribe(yt_url, return_timestamps=False):
	    """Download a YouTube video's audio and transcribe it.

	    The audio is extracted to MP3 inside a per-call temporary directory, so
	    concurrent requests no longer overwrite a shared "audio.mp3" in the
	    working directory and the download is removed once it is transcribed.

	    Args:
	        yt_url: URL of the video to download.
	        return_timestamps: Forwarded to `transcribe`.

	    Returns:
	        A tuple of (HTML embed snippet, transcript text).
	    """
	    import tempfile

	    html_embed_str = _return_yt_html_embed(yt_url)

	    with tempfile.TemporaryDirectory() as workdir:
	        ydl_opts = {
	            'format': 'bestaudio/best',
	            'outtmpl': os.path.join(workdir, 'audio.%(ext)s'),
	            'postprocessors': [{
	                'key': 'FFmpegExtractAudio',
	                'preferredcodec': 'mp3',
	                'preferredquality': '192',
	            }],
	            'quiet': True,
	        }
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            ydl.download([yt_url])

	        # The post-processor writes "audio.mp3" next to the download.
	        audio_path = os.path.join(workdir, "audio.mp3")
	        text = transcribe(audio_path, return_timestamps=return_timestamps)

	    return html_embed_str, text

	# Build the Gradio app without tabs

	demo = gr.Blocks()

	with demo:
	    # Only render the logo when the file is actually present, so a missing
	    # image does not prevent the app from starting.
	    if os.path.isfile(logo_path):
	        gr.Image(value=logo_path, label="Nasjonalbibliotek Logo", elem_id="logo")
	    # NOTE(review): `show_submit_button` is passed through unchanged; confirm
	    # that the installed Gradio version accepts it on gr.Interface.
	    mf_transcribe = gr.Interface(
	        fn=transcribe,
	        inputs=[
	            gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
	            gr.components.Checkbox(label="Inkluder tidsstempler"),
	        ],
	        outputs="text",
	        title="NB-Whisper",
	        description=(
	            "Transkriber lange lydopptak fra mikrofon eller lydfiler med et enkelt klikk! Demoen bruker den fintunede"
	            f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler opp til 30 minutter."
	        ),
	        allow_flagging="never",
	        show_submit_button=False,
	    )

	    # Uncomment to add the YouTube transcription interface if needed
	    # yt_transcribe_interface = gr.Interface(
	    #     fn=yt_transcribe,
	    #     inputs=[
	    #         gr.components.Textbox(lines=1, placeholder="Lim inn URL til en YouTube-video her", label="YouTube URL"),
	    #         gr.components.Checkbox(label="Inkluder tidsstempler"),
	    #     ],
	    #     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
	    #     outputs=["html", "text"],
	    #     title="Whisper Demo: Transkriber YouTube",
	    #     description=(
	    #         "Transkriber lange YouTube-videoer med et enkelt klikk! Demoen bruker den fintunede modellen:"
	    #         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler av"
	    #         " vilkårlig lengde."
	    #     ),
	    #     allow_flagging="never",
	    # )

	# Start the demo without tabs. The queue is enabled before launching:
	# `.queue()` was previously called on the return value of `launch()`,
	# which is not the Blocks object.
	demo.queue().launch(share=share)