dwb2023 committed
Commit 2de41dc · verified · 1 parent: a589aff

update app

Files changed (1):
  1. app.py +109 -68
app.py CHANGED
@@ -9,44 +9,46 @@ import pandas as pd
 import gradio as gr
 import yt_dlp as youtube_dl
 from transformers import (
-    BitsAndBytesConfig,
     AutoModelForSpeechSeq2Seq,
     AutoTokenizer,
     AutoFeatureExtractor,
     pipeline,
 )
 from transformers.pipelines.audio_utils import ffmpeg_read
-import torch  # If you're using PyTorch
+import torch
 from datasets import load_dataset, Dataset, DatasetDict
 import spaces
 
 # Constants
-MODEL_NAME = "openai/whisper-large-v3"
-BATCH_SIZE = 8
-YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes
+MODEL_NAME = "openai/whisper-large-v3-turbo"
+BATCH_SIZE = 8  # Optimized for better GPU utilization
+YT_LENGTH_LIMIT_S = 10800  # 3 hours
 DATASET_NAME = "dwb2023/yt-transcripts-v3"
 
 # Environment setup
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 # Model setup
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME,
-    quantization_config=bnb_config,
     use_cache=False,
     device_map="auto"
 )
 
+# Flash Attention setup for memory and speed optimization if supported
+try:
+    from flash_attn import flash_attn_fn
+    model.config.use_flash_attention = True
+except ImportError:
+    print("Flash Attention is not available. Proceeding without it.")
+
+# Note: torch.compile is not compatible with Flash Attention or the chunked long-form algorithm.
+
+# Processor setup
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
 
+# Pipeline setup
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
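One caveat on the Flash Attention block added above: the flash-attn package exports `flash_attn_func`, not `flash_attn_fn`, so the `try` import always raises ImportError and the fallback branch runs; Transformers also does not read an ad-hoc `model.config.use_flash_attention` attribute. The supported switch is the `attn_implementation` argument at load time, and FA2 needs half-precision weights. A minimal sketch under those assumptions, keeping this commit's MODEL_NAME:

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

MODEL_NAME = "openai/whisper-large-v3-turbo"  # as set in this commit

try:
    # Let Transformers wire in Flash Attention 2 itself (requires flash-attn installed).
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,                # FA2 requires fp16/bf16 weights
        attn_implementation="flash_attention_2",
        use_cache=False,
        device_map="auto",
    )
except (ImportError, ValueError):
    # flash-attn missing or unsupported on this hardware: plain attention fallback.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME, use_cache=False, device_map="auto"
    )
```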
@@ -56,7 +58,12 @@ pipe = pipeline(
 )
 
 def reset_and_update_dataset(new_data):
-    # Define the schema for an empty DataFrame
+    """
+    Resets and updates the dataset with new transcription data.
+
+    Args:
+        new_data (dict): Dictionary containing the new data to be added to the dataset.
+    """
     schema = {
         "url": pd.Series(dtype="str"),
         "transcription": pd.Series(dtype="str"),
@@ -67,22 +74,24 @@ def reset_and_update_dataset(new_data):
         "description": pd.Series(dtype="str"),
         "datetime": pd.Series(dtype="datetime64[ns]")
     }
-
-    # Create an empty DataFrame with the defined schema
     df = pd.DataFrame(schema)
-
-    # Append the new data
     df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)
-
-    # Convert back to dataset
     updated_dataset = Dataset.from_pandas(df)
-
-    # Push the updated dataset to the hub
     dataset_dict = DatasetDict({"train": updated_dataset})
     dataset_dict.push_to_hub(DATASET_NAME)
     print("Dataset reset and updated successfully!")
 
 def download_yt_audio(yt_url, filename):
+    """
+    Downloads audio from a YouTube video using yt_dlp.
+
+    Args:
+        yt_url (str): URL of the YouTube video.
+        filename (str): Path to save the downloaded audio.
+
+    Returns:
+        dict: Information about the YouTube video.
+    """
     info_loader = youtube_dl.YoutubeDL()
     try:
         info = info_loader.extract_info(yt_url, download=False)
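For reference, a hypothetical call to `reset_and_update_dataset`; every value below is a placeholder, and the keys mirror the schema above plus the fields that `save_transcription` writes later in this diff:

```python
from datetime import datetime

reset_and_update_dataset({
    "url": "https://www.youtube.com/watch?v=example",  # placeholder URL
    "transcription": "example transcription text",
    "title": "Example video",
    "duration": 123,
    "uploader": "example-channel",
    "upload_date": "20240101",
    "description": "N/A",
    "datetime": datetime.now().isoformat(),
})
```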
@@ -104,15 +113,20 @@ def download_yt_audio(yt_url, filename):
 
 @spaces.GPU(duration=120)
 def yt_transcribe(yt_url, task):
-    # Load the dataset
+    """
+    Transcribes a YouTube video and saves the transcription if it doesn't already exist.
+
+    Args:
+        yt_url (str): URL of the YouTube video.
+        task (str): Task to perform - "transcribe" or "translate".
+
+    Returns:
+        str: The transcription of the video.
+    """
     dataset = load_dataset(DATASET_NAME, split="train")
-
-    # Check if the transcription already exists
     for row in dataset:
         if row['url'] == yt_url:
-            return row['transcription']  # Return the existing transcription
-
-    # If transcription does not exist, perform the transcription
+            return row['transcription']
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
         info = download_yt_audio(yt_url, filepath)
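The cache check in `yt_transcribe` still iterates the whole split row by row on every request. A hypothetical helper (the name and shape are illustrative, not part of this commit) that performs the same lookup through column access, avoiding materializing each row as a dict:

```python
from datasets import load_dataset

DATASET_NAME = "dwb2023/yt-transcripts-v3"  # as defined in app.py

def find_cached_transcription(yt_url):
    """Return the cached transcription for yt_url, or None if absent."""
    dataset = load_dataset(DATASET_NAME, split="train")
    # Column access builds two plain lists once instead of decoding every row.
    cached = dict(zip(dataset["url"], dataset["transcription"]))
    return cached.get(yt_url)
```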
@@ -126,54 +140,56 @@ def yt_transcribe(yt_url, task):
             generate_kwargs={"task": task},
             return_timestamps=True,
         )["text"]
-
-        # Extract additional fields
-        try:
-            title = info.get("title", "N/A")
-            duration = info.get("duration", 0)
-            uploader = info.get("uploader", "N/A")
-            upload_date = info.get("upload_date", "N/A")
-            description = info.get("description", "N/A")
-        except KeyError:
-            title = "N/A"
-            duration = 0
-            uploader = "N/A"
-            upload_date = "N/A"
-            description = "N/A"
-
-        save_transcription(yt_url, text, title, duration, uploader, upload_date, description)
+        save_transcription(yt_url, text, info)
         return text
 
-def save_transcription(yt_url, transcription, title, duration, uploader, upload_date, description):
+def save_transcription(yt_url, transcription, info):
+    """
+    Saves the transcription data to the dataset.
+
+    Args:
+        yt_url (str): URL of the YouTube video.
+        transcription (str): The transcribed text.
+        info (dict): Additional information about the video.
+    """
     data = {
         "url": yt_url,
         "transcription": transcription,
-        "title": title,
-        "duration": duration,
-        "uploader": uploader,
-        "upload_date": upload_date,
-        "description": description,
+        "title": info.get("title", "N/A"),
+        "duration": info.get("duration", 0),
+        "uploader": info.get("uploader", "N/A"),
+        "upload_date": info.get("upload_date", "N/A"),
+        "description": info.get("description", "N/A"),
         "datetime": datetime.now().isoformat()
     }
-
-    # Load the existing dataset
     dataset = load_dataset(DATASET_NAME, split="train")
-
-    # Convert to pandas dataframe
     df = dataset.to_pandas()
-
-    # Append the new data
     df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
-
-    # Convert back to dataset
     updated_dataset = Dataset.from_pandas(df)
-
-    # Push the updated dataset to the hub
     dataset_dict = DatasetDict({"train": updated_dataset})
     dataset_dict.push_to_hub(DATASET_NAME)
 
+@spaces.GPU
+def transcribe(inputs, task):
+    """
+    Transcribes an audio input.
+
+    Args:
+        inputs (str): Path to the audio file.
+        task (str): Task to perform - "transcribe" or "translate".
+
+    Returns:
+        str: The transcription of the audio.
+    """
+    if inputs is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
+    return text
+
+# Gradio App Setup
 demo = gr.Blocks()
 
+# YouTube Transcribe Tab
 yt_transcribe_interface = gr.Interface(
     fn=yt_transcribe,
     inputs=[
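`save_transcription` round-trips the entire split through pandas and re-pushes it for every new video, so each save costs more as the dataset grows. A sketch of the same append using only `datasets` primitives; it assumes the new record's keys match the split's features exactly:

```python
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset

DATASET_NAME = "dwb2023/yt-transcripts-v3"  # as defined in app.py

def append_and_push(data):
    """Hypothetical pandas-free variant of the tail of save_transcription."""
    dataset = load_dataset(DATASET_NAME, split="train")
    updated = concatenate_datasets([dataset, Dataset.from_list([data])])
    DatasetDict({"train": updated}).push_to_hub(DATASET_NAME)
```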
@@ -185,20 +201,45 @@ yt_transcribe_interface = gr.Interface(
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs="text",
-    title="👂👁️👅👃✋ KnowledgeScribe 📝 🧠💡🎓🚀",
+    title="YouTube Transcription",
     description=(
-        f"""**KnowledgeScribe** is your all-in-one transcription and summarization tool designed to help your LLM extract and distill knowledge from various sources, including YouTube videos and Arxiv papers.
-        \n\nCurrently leverages the following datasets and models:
-        \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer) dataset
-        \n- [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) model
-        """
+        f"Transcribe and archive YouTube videos using the {MODEL_NAME} model. "
+        "The transcriptions are saved for future reference, so repeated requests are faster!"
     ),
     allow_flagging="never",
 )
 
+# Microphone Transcribe Tab
+mf_transcribe_interface = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(sources="microphone", type="filepath"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs="text",
+    title="Microphone Transcription",
+    description="Transcribe audio captured through your microphone.",
+    allow_flagging="never",
+)
+
+# File Upload Transcribe Tab
+file_transcribe_interface = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(sources="upload", type="filepath", label="Audio file"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs="text",
+    title="Audio File Transcription",
+    description="Transcribe uploaded audio files of arbitrary length.",
+    allow_flagging="never",
+)
+
+# Organize Tabs in the Gradio App
 with demo:
     gr.TabbedInterface(
-        [yt_transcribe_interface], ["YouTube"]
+        [yt_transcribe_interface, mf_transcribe_interface, file_transcribe_interface],
+        ["YouTube", "Microphone", "Audio File"]
     )
 
-demo.queue().launch()
+demo.queue().launch()
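Since the `gr.Blocks()` context holds nothing but the `gr.TabbedInterface`, the same app can be expressed with `gr.TabbedInterface` directly; `queue()` is what lets requests line up for the `@spaces.GPU`-decorated functions. A sketch of that variant (the `max_size` cap is an assumption, not part of this commit):

```python
import gradio as gr

# yt_transcribe_interface, mf_transcribe_interface and file_transcribe_interface
# are the three gr.Interface objects built above.
demo = gr.TabbedInterface(
    [yt_transcribe_interface, mf_transcribe_interface, file_transcribe_interface],
    ["YouTube", "Microphone", "Audio File"],
)
demo.queue(max_size=20).launch()
```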
 