knowledge-scribe

Sleeping

App Files Community

dwb2023 committed on Jun 10, 2024

Commit

68bab0c

verified ·

1 Parent(s): 9d1f846

Update app.py

Browse files

that was a fine mess Ollie... much cleaner finally

Files changed (1) hide show

app.py +63 -71

app.py CHANGED Viewed

@@ -1,31 +1,51 @@
-import gradio as gr
-import yt_dlp as youtube_dl
-from transformers import pipeline, BitsAndBytesConfig, WhisperForConditionalGeneration
-from transformers.pipelines.audio_utils import ffmpeg_read
-import torch
-from huggingface_hub import CommitScheduler
-import spaces
-import tempfile
 import os
 import json
 from datetime import datetime
 from pathlib import Path
 from uuid import uuid4
-from functools import lru_cache
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-MODEL_NAME = "dwb2023/whisper-large-v3-quantized"
 BATCH_SIZE = 8
 YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes
-device = 0 if torch.cuda.is_available() else "cpu"
-# Load the model
-model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")
 # bnb_config = bnb.QuantizationConfig(bits=4)
-pipe = pipeline(task="automatic-speech-recognition", model=model, chunk_length_s=30, device=device)
 # Define paths and create directory if not exists
 JSON_DATASET_DIR = Path("json_dataset")
@@ -40,22 +60,6 @@ scheduler = CommitScheduler(
     path_in_repo="data",
 )
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
-    )
-    return HTML_str
-@spaces.GPU
-@lru_cache(maxsize=10)
-def transcribe_audio(inputs, task):
-    if inputs is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return text
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
     try:
@@ -66,13 +70,15 @@ def download_yt_audio(yt_url, filename):
     if file_length > YT_LENGTH_LIMIT_S:
         yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
         file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
-        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         ydl.download([yt_url])
 @spaces.GPU
-@lru_cache(maxsize=10)
 def yt_transcribe(yt_url, task):
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
@@ -81,40 +87,40 @@ def yt_transcribe(yt_url, task):
             inputs = f.read()
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
     save_transcription(yt_url, text)
     return text
 def save_transcription(yt_url, transcription):
     with scheduler.lock:
         with JSON_DATASET_PATH.open("a") as f:
-            json.dump({"url": yt_url, "transcription": transcription, "datetime": datetime.now().isoformat()}, f)
             f.write("\n")
-@spaces.GPU
-def yt_transcribe2(yt_url, task, max_filesize=75.0):
-    html_embed_str = _return_yt_html_embed(yt_url)
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        filepath = os.path.join(tmpdirname, "video.mp4")
-        download_yt_audio(yt_url, filepath)
-        with open(filepath, "rb") as f:
-            inputs = f.read()
-    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
-    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return html_embed_str, text
 demo = gr.Blocks()
 yt_transcribe_interface = gr.Interface(
     fn=yt_transcribe,
     inputs=[
-        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
     ],
     outputs="text",
     title="Whisper Large V3: Transcribe YouTube",
@@ -126,23 +132,9 @@ yt_transcribe_interface = gr.Interface(
     allow_flagging="never",
 )
-yt_transcribe = gr.Interface(
-    fn=yt_transcribe2,
-    inputs=[
-        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
-    ],
-    outputs=["html", "text"],
-    title="Whisper Large V3: Transcribe YouTube",
-    description=(
-        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
-        " arbitrary length."
-    ),
-    allow_flagging="never",
-)
 with demo:
-    gr.TabbedInterface([yt_transcribe_interface, yt_transcribe], ["YouTube", "YouTube HF"])
 demo.queue().launch()

 import os
 import json
+import time
 from datetime import datetime
 from pathlib import Path
 from uuid import uuid4
+import tempfile
+import gradio as gr
+import yt_dlp as youtube_dl
+from huggingface_hub import CommitScheduler
+from transformers import (
+    BitsAndBytesConfig,
+    AutoModelForSpeechSeq2Seq,
+    AutoTokenizer,
+    AutoFeatureExtractor,
+    pipeline,
+)
+from transformers.pipelines.audio_utils import ffmpeg_read
+# import torch  # If you're using PyTorch
+import spaces
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes
+# Quantization
+bnb_config = BitsAndBytesConfig(load_in_4bit=True)
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    MODEL_NAME,
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
 # bnb_config = bnb.QuantizationConfig(bits=4)
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=model,
+    tokenizer=tokenizer,
+    feature_extractor=feature_extractor,
+    chunk_length_s=30,
+    # device=device,
+)
 # Define paths and create directory if not exists
 JSON_DATASET_DIR = Path("json_dataset")
     path_in_repo="data",
 )
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
     try:
     if file_length > YT_LENGTH_LIMIT_S:
         yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
         file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
+        raise gr.Error(
+            f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
+        )
     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         ydl.download([yt_url])
 @spaces.GPU
 def yt_transcribe(yt_url, task):
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
             inputs = f.read()
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    text = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps=True,
+    )["text"]
     save_transcription(yt_url, text)
     return text
 def save_transcription(yt_url, transcription):
     with scheduler.lock:
         with JSON_DATASET_PATH.open("a") as f:
+            json.dump(
+                {
+                    "url": yt_url,
+                    "transcription": transcription,
+                    "datetime": datetime.now().isoformat(),
+                },
+                f,
+            )
             f.write("\n")
 demo = gr.Blocks()
 yt_transcribe_interface = gr.Interface(
     fn=yt_transcribe,
     inputs=[
+        gr.Textbox(
+            lines=1,
+            placeholder="Paste the URL to a YouTube video here",
+            label="YouTube URL",
+        ),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs="text",
     title="Whisper Large V3: Transcribe YouTube",
     allow_flagging="never",
 )
 with demo:
+    gr.TabbedInterface(
+        [yt_transcribe_interface], ["YouTube"]
+    )
 demo.queue().launch()