File size: 7,243 Bytes
88183ad
7bbd83c
68bab0c
7bbd83c
 
68bab0c
416dca9
68bab0c
 
 
 
 
 
 
 
 
 
2de41dc
416dca9
751197e
069559b
a11fbef
416dca9
2de41dc
 
 
416dca9
4c90570
6c226f9
416dca9
 
b3ee19f
4c90570
f4720e3
2de41dc
68bab0c
 
4c90570
 
 
68bab0c
6c226f9
416dca9
2de41dc
 
 
 
 
 
416dca9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c226f9
d790c0b
2de41dc
 
 
 
 
 
 
 
 
 
d790c0b
 
 
 
 
416dca9
554c0b5
 
 
 
68bab0c
 
 
416dca9
7bbd83c
d790c0b
7bbd83c
416dca9
66efbc3
751197e
7bbd83c
2de41dc
 
 
 
 
 
 
 
 
 
416dca9
 
 
2de41dc
d790c0b
 
416dca9
d790c0b
416dca9
 
 
 
 
 
 
 
 
2de41dc
416dca9
 
2de41dc
 
 
 
 
 
 
 
 
416dca9
 
 
2de41dc
 
 
 
 
416dca9
 
 
 
 
 
 
 
6c226f9
2de41dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c226f9
 
2de41dc
7bbd83c
6c226f9
7097513
68bab0c
 
 
 
 
 
7097513
7bbd83c
2de41dc
6c226f9
2de41dc
 
6c226f9
 
410acce
6c226f9
2de41dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c226f9
68bab0c
2de41dc
 
68bab0c
6c226f9
2de41dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import os
import json
import time
from datetime import datetime
from pathlib import Path
import tempfile
import pandas as pd

import gradio as gr
import yt_dlp as youtube_dl
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoTokenizer,
    AutoFeatureExtractor,
    pipeline,
)
from transformers.pipelines.audio_utils import ffmpeg_read
import torch
from datasets import load_dataset, Dataset, DatasetDict
import spaces


# Constants
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8  # Optimized for better GPU utilization
YT_LENGTH_LIMIT_S = 10800  # 3 hours
DATASET_NAME = "dwb2023/yt-transcripts-v3"  # HF Hub dataset used as the transcription archive
FILE_LIMIT_MB = 1000  # NOTE(review): not referenced anywhere in this file — confirm it is still needed

# Environment setup
# Enables the hf_transfer accelerated download backend; must be set before
# any Hub download is triggered (e.g. by the pipeline() call below).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# 0 selects the first CUDA device; transformers accepts the string "cpu" as fallback.
device = 0 if torch.cuda.is_available() else "cpu"

# Pipeline setup
# Module-level singleton: constructing this downloads/loads the Whisper model once,
# and the functions below (yt_transcribe, transcribe) share it.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,  # long audio is chunked into 30 s windows
    device=device,
)

def reset_and_update_dataset(new_data):
    """
    Replace the remote dataset's "train" split with a single new record.

    Builds a one-row DataFrame from *new_data*, coerces the columns to the
    expected schema, and pushes the result to the Hub, overwriting the
    previous contents of DATASET_NAME.

    Args:
        new_data (dict): Record with keys "url", "transcription", "title",
            "duration", "uploader", "upload_date", "description" and
            "datetime" describing one transcription.
    """
    # Expected column order / schema of the pushed dataset.
    columns = [
        "url",
        "transcription",
        "title",
        "duration",
        "uploader",
        "upload_date",
        "description",
        "datetime",
    ]
    # Build the frame directly from the record instead of concatenating onto
    # an empty typed frame: pd.concat with an empty/all-NA frame is deprecated
    # in pandas >= 2.1 and silently discarded the declared dtypes anyway.
    df = pd.DataFrame([new_data], columns=columns)
    # Coerce typed columns explicitly; unparseable values ("N/A", missing)
    # become NaT/NaN instead of raising.
    for col in ("upload_date", "datetime"):
        df[col] = pd.to_datetime(df[col], errors="coerce")
    df["duration"] = pd.to_numeric(df["duration"], errors="coerce")

    updated_dataset = Dataset.from_pandas(df)
    dataset_dict = DatasetDict({"train": updated_dataset})
    dataset_dict.push_to_hub(DATASET_NAME)
    print("Dataset reset and updated successfully!")

def download_yt_audio(yt_url, filename):
    """
    Download the audio track of a YouTube video to *filename*.

    The video metadata is probed first (without downloading) so the length
    limit can be enforced before any bytes are transferred.

    Args:
        yt_url (str): URL of the YouTube video.
        filename (str): Path where the downloaded audio is written
            (used as yt-dlp's "outtmpl").

    Returns:
        dict: yt-dlp metadata for the video (title, duration, uploader, ...).

    Raises:
        gr.Error: If metadata extraction fails, the video reports no duration
            (e.g. a live stream), or the video exceeds YT_LENGTH_LIMIT_S.
    """
    # Use a context manager so the probing YoutubeDL instance is cleaned up,
    # matching the downloading instance below.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err))

    file_length = info.get("duration")
    if file_length is None:
        # Live streams (and some premieres) report no duration; the previous
        # code crashed with a TypeError on the comparison below.
        raise gr.Error("Could not determine the video duration (is this a live stream?).")
    if file_length > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
        raise gr.Error(
            f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
        )

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])
    return info

@spaces.GPU(duration=120)
def yt_transcribe(yt_url, task):
    """
    Transcribe a YouTube video, reusing a cached transcription when one exists.

    The archive dataset is scanned first; on a cache miss the audio is
    downloaded to a temporary directory, run through the ASR pipeline, and
    the result is appended to the archive.

    Args:
        yt_url (str): URL of the YouTube video.
        task (str): Either "transcribe" or "translate".

    Returns:
        str: The transcription of the video.
    """
    # Serve from the archive if this URL has been transcribed before.
    archive = load_dataset(DATASET_NAME, split="train")
    for record in archive:
        if record['url'] == yt_url:
            return record['transcription']

    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "video.mp4")
        info = download_yt_audio(yt_url, audio_path)

        with open(audio_path, "rb") as audio_file:
            raw_bytes = audio_file.read()

        # Decode to a float array at the model's expected sampling rate.
        sampling_rate = pipe.feature_extractor.sampling_rate
        audio = {
            "array": ffmpeg_read(raw_bytes, sampling_rate),
            "sampling_rate": sampling_rate,
        }

        result = pipe(
            audio,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        text = result["text"]

        save_transcription(yt_url, text, info)
        return text

def save_transcription(yt_url, transcription, info):
    """
    Append one transcription record to the Hub dataset's train split.

    Args:
        yt_url (str): URL of the transcribed video.
        transcription (str): The transcribed text.
        info (dict): yt-dlp metadata for the video.
    """
    record = {
        "url": yt_url,
        "transcription": transcription,
        "title": info.get("title", "N/A"),
        "duration": info.get("duration", 0),
        "uploader": info.get("uploader", "N/A"),
        "upload_date": info.get("upload_date", "N/A"),
        "description": info.get("description", "N/A"),
        "datetime": datetime.now().isoformat(),
    }
    # Pull the current split, append the new row, and push the whole split back.
    existing = load_dataset(DATASET_NAME, split="train").to_pandas()
    combined = pd.concat([existing, pd.DataFrame([record])], ignore_index=True)
    DatasetDict({"train": Dataset.from_pandas(combined)}).push_to_hub(DATASET_NAME)

@spaces.GPU
def transcribe(inputs, task):
    """
    Transcribe a local audio file through the shared ASR pipeline.

    Args:
        inputs (str): Path to the audio file, or None when nothing was submitted.
        task (str): Either "transcribe" or "translate".

    Returns:
        str: The transcription of the audio.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return result["text"]

# Gradio App Setup
demo = gr.Blocks()

# YouTube Transcribe Tab
# Takes a URL + task radio; yt_transcribe caches results in the Hub dataset.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(
            lines=1,
            placeholder="Paste the URL to a YouTube video here",
            label="YouTube URL",
        ),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="YouTube Transcription",
    description=(
        f"Transcribe and archive YouTube videos using the {MODEL_NAME} model. "
        "The transcriptions are saved for future reference, so repeated requests are faster!"
    ),
    allow_flagging="never",
)

# Microphone Transcribe Tab
# Audio is recorded in the browser and passed to transcribe() as a file path.
mf_transcribe_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Microphone Transcription",
    description="Transcribe audio captured through your microphone.",
    allow_flagging="never",
)

# File Upload Transcribe Tab
# Same handler as the microphone tab; only the audio source differs.
file_transcribe_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Audio File Transcription",
    description="Transcribe uploaded audio files of arbitrary length.",
    allow_flagging="never",
)

# Organize Tabs in the Gradio App
with demo:
    gr.TabbedInterface(
        [yt_transcribe_interface, mf_transcribe_interface, file_transcribe_interface],
        ["YouTube", "Microphone", "Audio File"]
    )

# queue() serializes requests so the single shared pipeline isn't hit concurrently.
demo.queue().launch()