Spaces:
Sleeping
Sleeping
File size: 7,243 Bytes
88183ad 7bbd83c 68bab0c 7bbd83c 68bab0c 416dca9 68bab0c 2de41dc 416dca9 751197e 069559b a11fbef 416dca9 2de41dc 416dca9 4c90570 6c226f9 416dca9 b3ee19f 4c90570 f4720e3 2de41dc 68bab0c 4c90570 68bab0c 6c226f9 416dca9 2de41dc 416dca9 6c226f9 d790c0b 2de41dc d790c0b 416dca9 554c0b5 68bab0c 416dca9 7bbd83c d790c0b 7bbd83c 416dca9 66efbc3 751197e 7bbd83c 2de41dc 416dca9 2de41dc d790c0b 416dca9 d790c0b 416dca9 2de41dc 416dca9 2de41dc 416dca9 2de41dc 416dca9 6c226f9 2de41dc 6c226f9 2de41dc 7bbd83c 6c226f9 7097513 68bab0c 7097513 7bbd83c 2de41dc 6c226f9 2de41dc 6c226f9 410acce 6c226f9 2de41dc 6c226f9 68bab0c 2de41dc 68bab0c 6c226f9 2de41dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
import os
import json
import time
from datetime import datetime
from pathlib import Path
import tempfile
import pandas as pd
import gradio as gr
import yt_dlp as youtube_dl
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoTokenizer,
AutoFeatureExtractor,
pipeline,
)
from transformers.pipelines.audio_utils import ffmpeg_read
import torch
from datasets import load_dataset, Dataset, DatasetDict
import spaces
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
MODEL_NAME = "openai/whisper-large-v3-turbo"  # checkpoint loaded into the ASR pipeline below
BATCH_SIZE = 8  # forwarded as batch_size on every pipe(...) call in this file
YT_LENGTH_LIMIT_S = 3 * 60 * 60  # 3 hours (10800 seconds)
DATASET_NAME = "dwb2023/yt-transcripts-v3"  # hub dataset used to store transcriptions
FILE_LIMIT_MB = 1000  # not referenced by the code visible in this file

# ---------------------------------------------------------------------------
# Environment setup
# ---------------------------------------------------------------------------
# NOTE(review): this is assigned after the transformers/datasets imports at the
# top of the file -- confirm the hub client still honours it at this point.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Use CUDA device index 0 when a GPU is visible, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = 0

# ---------------------------------------------------------------------------
# Pipeline setup
# ---------------------------------------------------------------------------
# Shared speech-recognition pipeline; audio is processed in 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
def reset_and_update_dataset(new_data):
    """
    Replace the hub dataset's "train" split with a single-row dataset.

    An empty, typed frame is built from the column schema below, ``new_data``
    is appended to it as one row, and the result is pushed to ``DATASET_NAME``.

    Args:
        new_data (dict): Dictionary containing the row to store in the dataset.
    """
    # Column name -> pandas dtype; dict order defines the column order.
    column_dtypes = {
        "url": "str",
        "transcription": "str",
        "title": "str",
        "duration": "int",
        "uploader": "str",
        "upload_date": "datetime64[ns]",
        "description": "str",
        "datetime": "datetime64[ns]",
    }
    empty_frame = pd.DataFrame(
        {name: pd.Series(dtype=dtype) for name, dtype in column_dtypes.items()}
    )
    new_row = pd.DataFrame([new_data])
    frame = pd.concat([empty_frame, new_row], ignore_index=True)

    # Wrap the frame as the only split and upload it.
    DatasetDict({"train": Dataset.from_pandas(frame)}).push_to_hub(DATASET_NAME)
    print("Dataset reset and updated successfully!")
def download_yt_audio(yt_url, filename):
    """
    Downloads audio from a YouTube video using yt_dlp.

    Metadata is fetched first (without downloading anything) so that videos
    longer than ``YT_LENGTH_LIMIT_S`` are rejected before any media is
    transferred.

    Args:
        yt_url (str): URL of the YouTube video.
        filename (str): Path to save the downloaded audio.
    Returns:
        dict: Information about the YouTube video.
    Raises:
        gr.Error: If yt_dlp fails to extract the metadata or to download the
            audio, if the video reports no duration, or if the video is
            longer than ``YT_LENGTH_LIMIT_S``.
    """
    # Use the loader as a context manager so it is closed after the lookup.
    try:
        with youtube_dl.YoutubeDL() as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # "duration" can be absent or None; fail with a readable message instead
    # of a KeyError/TypeError in the comparison below.
    file_length = info.get("duration")
    if file_length is None:
        raise gr.Error("Could not determine the length of the YouTube video.")

    if file_length > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
        raise gr.Error(
            f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
        )

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    # Surface download failures to the UI the same way as metadata failures.
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([yt_url])
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    return info
@spaces.GPU(duration=120)
def yt_transcribe(yt_url, task):
    """
    Transcribes a YouTube video and saves the transcription if it doesn't already exist.

    The hub dataset is checked first; when a row with the same URL exists its
    stored transcription is returned without running the model.

    Args:
        yt_url (str): URL of the YouTube video.
        task (str): Task to perform - "transcribe" or "translate".
    Returns:
        str: The transcription of the video.
    Raises:
        gr.Error: If no URL was submitted (download/validation errors raised
            by ``download_yt_audio`` propagate unchanged).
    """
    # Reject blank input before doing any network work, and normalise stray
    # whitespace from pasted URLs so it does not defeat the cache lookup.
    yt_url = (yt_url or "").strip()
    if not yt_url:
        raise gr.Error("No YouTube URL submitted! Please paste a YouTube URL before submitting your request.")

    # NOTE(review): the cache is keyed by URL only, so a stored transcription
    # is returned regardless of the requested `task`.
    dataset = load_dataset(DATASET_NAME, split="train")
    # Scan only the url column; the full row is decoded just for the first match.
    for row_index, known_url in enumerate(dataset["url"]):
        if known_url == yt_url:
            return dataset[row_index]["transcription"]

    # Download into a throwaway directory and keep only the raw bytes.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        info = download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            video_data = f.read()

    # Decode to a waveform at the sampling rate the feature extractor expects.
    sampling_rate = pipe.feature_extractor.sampling_rate
    inputs = {
        "array": ffmpeg_read(video_data, sampling_rate),
        "sampling_rate": sampling_rate,
    }

    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )["text"]

    save_transcription(yt_url, text, info)
    return text
def save_transcription(yt_url, transcription, info):
    """
    Append one transcription record to the hub dataset.

    The current "train" split is loaded, the new record is appended as the
    last row, and the whole split is pushed back to ``DATASET_NAME``.

    Args:
        yt_url (str): URL of the YouTube video.
        transcription (str): The transcribed text.
        info (dict): Video metadata; missing keys fall back to "N/A" (0 for duration).
    """
    record = dict(
        url=yt_url,
        transcription=transcription,
        title=info.get("title", "N/A"),
        duration=info.get("duration", 0),
        uploader=info.get("uploader", "N/A"),
        upload_date=info.get("upload_date", "N/A"),
        description=info.get("description", "N/A"),
        datetime=datetime.now().isoformat(),  # local, naive timestamp of the save
    )

    existing_rows = load_dataset(DATASET_NAME, split="train").to_pandas()
    combined = pd.concat([existing_rows, pd.DataFrame([record])], ignore_index=True)

    # Re-upload the full split including the appended row.
    DatasetDict({"train": Dataset.from_pandas(combined)}).push_to_hub(DATASET_NAME)
@spaces.GPU
def transcribe(inputs, task):
    """
    Run the speech-recognition pipeline on a single audio input.

    Args:
        inputs (str): Path to the audio file.
        task (str): Task to perform - "transcribe" or "translate".
    Returns:
        str: The transcription of the audio.
    Raises:
        gr.Error: If no audio input was provided.
    """
    # Guard clause: nothing was uploaded or recorded.
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return result["text"]
# Gradio App Setup
demo = gr.Blocks()


def _task_selector():
    """Build a fresh transcribe/translate radio; every interface gets its own instance."""
    return gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")


# YouTube Transcribe Tab: URL in, transcription text out.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(
            lines=1,
            placeholder="Paste the URL to a YouTube video here",
            label="YouTube URL",
        ),
        _task_selector(),
    ],
    outputs="text",
    title="YouTube Transcription",
    description=(
        f"Transcribe and archive YouTube videos using the {MODEL_NAME} model. "
        "The transcriptions are saved for future reference, so repeated requests are faster!"
    ),
    allow_flagging="never",
)

# Microphone Transcribe Tab: recorded audio is handed to transcribe() as a file path.
mf_transcribe_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        _task_selector(),
    ],
    outputs="text",
    title="Microphone Transcription",
    description="Transcribe audio captured through your microphone.",
    allow_flagging="never",
)

# File Upload Transcribe Tab: uploaded audio is handed to transcribe() as a file path.
file_transcribe_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        _task_selector(),
    ],
    outputs="text",
    title="Audio File Transcription",
    description="Transcribe uploaded audio files of arbitrary length.",
    allow_flagging="never",
)
# Organize Tabs in the Gradio App: the three interfaces are rendered as tabs
# inside the shared Blocks container.
with demo:
    gr.TabbedInterface(
        [yt_transcribe_interface, mf_transcribe_interface, file_transcribe_interface],
        ["YouTube", "Microphone", "Audio File"],
    )

# Enable request queuing, then start the server. (A stray trailing "|" after
# this call made the statement a syntax error and has been removed.)
demo.queue().launch()