import gc
import os
import tempfile
from time import monotonic

import torch
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 5400  # limit to 1.5-hour YouTube videos

device = "cuda:1" if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=torch.float16,
    chunk_length_s=30,
    device=device,
    generate_kwargs={"language": "english"},
)


def download_yt_audio(yt_url, filename, time_limit_s=YT_LENGTH_LIMIT_S):
    """Download the audio track of a YouTube video to `filename`, enforcing a length limit."""
    info_loader = youtube_dl.YoutubeDL()
    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise ValueError(f"Error downloading video: {str(err)}")

    file_length = info["duration"]
    if file_length > time_limit_s:
        raise ValueError(
            f"Video is too long. Maximum allowed length is {time_limit_s / 3600:.1f} hour(s)."
        )

    # Only download the best available audio format
    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except youtube_dl.utils.ExtractorError as err:
            raise ValueError(f"Error extracting audio: {str(err)}")


def transcribe(yt_url, time_limit_s=YT_LENGTH_LIMIT_S):
    """Download a YouTube video's audio, run it through the Whisper pipeline, and return the transcript."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")

        t0 = monotonic()
        download_yt_audio(yt_url, filepath, time_limit_s)
        t1 = monotonic()
        print(f"Downloaded video in {t1 - t0:.2f} seconds.")

        with open(filepath, "rb") as f:
            inputs = f.read()

        # Decode the raw bytes into a waveform at the model's expected sampling rate
        inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
        inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

        t0 = monotonic()
        text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
        t1 = monotonic()
        print(f"Transcribed video in {t1 - t0:.2f} seconds.")

    # Release cached GPU memory and collect garbage between runs
    torch.cuda.empty_cache()
    gc.collect()

    return text
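

# Example usage: a minimal sketch showing how transcribe() is called.
# The URL below is a placeholder (not part of the original code); replace it with
# any public YouTube link shorter than YT_LENGTH_LIMIT_S before running.
if __name__ == "__main__":
    yt_url = "https://www.youtube.com/watch?v=YOUR_VIDEO_ID"  # hypothetical placeholder
    transcript = transcribe(yt_url)
    print(transcript)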