dwb2023 committed on
Commit
68bab0c
1 Parent(s): 9d1f846

Update app.py

Browse files

that was a fine mess Ollie... much cleaner finally

Files changed (1) hide show
  1. app.py +63 -71
app.py CHANGED
@@ -1,31 +1,51 @@
1
- import gradio as gr
2
- import yt_dlp as youtube_dl
3
- from transformers import pipeline, BitsAndBytesConfig, WhisperForConditionalGeneration
4
- from transformers.pipelines.audio_utils import ffmpeg_read
5
- import torch
6
- from huggingface_hub import CommitScheduler
7
- import spaces
8
- import tempfile
9
  import os
10
  import json
 
11
  from datetime import datetime
12
  from pathlib import Path
13
  from uuid import uuid4
14
- from functools import lru_cache
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
17
 
18
- MODEL_NAME = "dwb2023/whisper-large-v3-quantized"
19
  BATCH_SIZE = 8
20
  YT_LENGTH_LIMIT_S = 4800 # 1 hour 20 minutes
21
 
22
- device = 0 if torch.cuda.is_available() else "cpu"
23
-
24
- # Load the model
25
- model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")
 
 
 
 
 
26
 
27
  # bnb_config = bnb.QuantizationConfig(bits=4)
28
- pipe = pipeline(task="automatic-speech-recognition", model=model, chunk_length_s=30, device=device)
 
 
 
 
 
 
 
29
 
30
  # Define paths and create directory if not exists
31
  JSON_DATASET_DIR = Path("json_dataset")
@@ -40,22 +60,6 @@ scheduler = CommitScheduler(
40
  path_in_repo="data",
41
  )
42
 
43
- def _return_yt_html_embed(yt_url):
44
- video_id = yt_url.split("?v=")[-1]
45
- HTML_str = (
46
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
47
- " </center>"
48
- )
49
- return HTML_str
50
-
51
- @spaces.GPU
52
- @lru_cache(maxsize=10)
53
- def transcribe_audio(inputs, task):
54
- if inputs is None:
55
- raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
56
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
57
- return text
58
-
59
  def download_yt_audio(yt_url, filename):
60
  info_loader = youtube_dl.YoutubeDL()
61
  try:
@@ -66,13 +70,15 @@ def download_yt_audio(yt_url, filename):
66
  if file_length > YT_LENGTH_LIMIT_S:
67
  yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
68
  file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
69
- raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
 
 
70
  ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
71
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
72
  ydl.download([yt_url])
73
 
 
74
  @spaces.GPU
75
- @lru_cache(maxsize=10)
76
  def yt_transcribe(yt_url, task):
77
  with tempfile.TemporaryDirectory() as tmpdirname:
78
  filepath = os.path.join(tmpdirname, "video.mp4")
@@ -81,40 +87,40 @@ def yt_transcribe(yt_url, task):
81
  inputs = f.read()
82
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
83
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
84
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
 
 
 
 
 
85
  save_transcription(yt_url, text)
86
  return text
87
 
 
88
  def save_transcription(yt_url, transcription):
89
  with scheduler.lock:
90
  with JSON_DATASET_PATH.open("a") as f:
91
- json.dump({"url": yt_url, "transcription": transcription, "datetime": datetime.now().isoformat()}, f)
 
 
 
 
 
 
 
92
  f.write("\n")
93
 
94
- @spaces.GPU
95
- def yt_transcribe2(yt_url, task, max_filesize=75.0):
96
- html_embed_str = _return_yt_html_embed(yt_url)
97
-
98
- with tempfile.TemporaryDirectory() as tmpdirname:
99
- filepath = os.path.join(tmpdirname, "video.mp4")
100
- download_yt_audio(yt_url, filepath)
101
- with open(filepath, "rb") as f:
102
- inputs = f.read()
103
-
104
- inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
105
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
106
-
107
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
108
-
109
- return html_embed_str, text
110
-
111
  demo = gr.Blocks()
112
 
113
  yt_transcribe_interface = gr.Interface(
114
  fn=yt_transcribe,
115
  inputs=[
116
- gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
117
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
 
 
 
 
118
  ],
119
  outputs="text",
120
  title="Whisper Large V3: Transcribe YouTube",
@@ -126,23 +132,9 @@ yt_transcribe_interface = gr.Interface(
126
  allow_flagging="never",
127
  )
128
 
129
- yt_transcribe = gr.Interface(
130
- fn=yt_transcribe2,
131
- inputs=[
132
- gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
133
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
134
- ],
135
- outputs=["html", "text"],
136
- title="Whisper Large V3: Transcribe YouTube",
137
- description=(
138
- "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
139
- f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
140
- " arbitrary length."
141
- ),
142
- allow_flagging="never",
143
- )
144
-
145
  with demo:
146
- gr.TabbedInterface([yt_transcribe_interface, yt_transcribe], ["YouTube", "YouTube HF"])
 
 
147
 
148
  demo.queue().launch()
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
+ import time
4
  from datetime import datetime
5
  from pathlib import Path
6
  from uuid import uuid4
7
+ import tempfile
8
+
9
+ import gradio as gr
10
+ import yt_dlp as youtube_dl
11
+ from huggingface_hub import CommitScheduler
12
+ from transformers import (
13
+ BitsAndBytesConfig,
14
+ AutoModelForSpeechSeq2Seq,
15
+ AutoTokenizer,
16
+ AutoFeatureExtractor,
17
+ pipeline,
18
+ )
19
+ from transformers.pipelines.audio_utils import ffmpeg_read
20
+
21
+ # import torch # If you're using PyTorch
22
+ import spaces
23
 
24
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
25
 
26
+ MODEL_NAME = "openai/whisper-large-v3"
27
  BATCH_SIZE = 8
28
  YT_LENGTH_LIMIT_S = 4800 # 1 hour 20 minutes
29
 
30
+ # Quantization
31
+ bnb_config = BitsAndBytesConfig(load_in_4bit=True)
32
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
33
+ MODEL_NAME,
34
+ quantization_config=bnb_config,
35
+ device_map="auto"
36
+ )
37
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
38
+ feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
39
 
40
  # bnb_config = bnb.QuantizationConfig(bits=4)
41
+ pipe = pipeline(
42
+ task="automatic-speech-recognition",
43
+ model=model,
44
+ tokenizer=tokenizer,
45
+ feature_extractor=feature_extractor,
46
+ chunk_length_s=30,
47
+ # device=device,
48
+ )
49
 
50
  # Define paths and create directory if not exists
51
  JSON_DATASET_DIR = Path("json_dataset")
 
60
  path_in_repo="data",
61
  )
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def download_yt_audio(yt_url, filename):
64
  info_loader = youtube_dl.YoutubeDL()
65
  try:
 
70
  if file_length > YT_LENGTH_LIMIT_S:
71
  yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
72
  file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
73
+ raise gr.Error(
74
+ f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
75
+ )
76
  ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
77
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
78
  ydl.download([yt_url])
79
 
80
+
81
  @spaces.GPU
 
82
  def yt_transcribe(yt_url, task):
83
  with tempfile.TemporaryDirectory() as tmpdirname:
84
  filepath = os.path.join(tmpdirname, "video.mp4")
 
87
  inputs = f.read()
88
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
89
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
90
+ text = pipe(
91
+ inputs,
92
+ batch_size=BATCH_SIZE,
93
+ generate_kwargs={"task": task},
94
+ return_timestamps=True,
95
+ )["text"]
96
  save_transcription(yt_url, text)
97
  return text
98
 
99
+
100
  def save_transcription(yt_url, transcription):
101
  with scheduler.lock:
102
  with JSON_DATASET_PATH.open("a") as f:
103
+ json.dump(
104
+ {
105
+ "url": yt_url,
106
+ "transcription": transcription,
107
+ "datetime": datetime.now().isoformat(),
108
+ },
109
+ f,
110
+ )
111
  f.write("\n")
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  demo = gr.Blocks()
114
 
115
  yt_transcribe_interface = gr.Interface(
116
  fn=yt_transcribe,
117
  inputs=[
118
+ gr.Textbox(
119
+ lines=1,
120
+ placeholder="Paste the URL to a YouTube video here",
121
+ label="YouTube URL",
122
+ ),
123
+ gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
124
  ],
125
  outputs="text",
126
  title="Whisper Large V3: Transcribe YouTube",
 
132
  allow_flagging="never",
133
  )
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  with demo:
136
+ gr.TabbedInterface(
137
+ [yt_transcribe_interface], ["YouTube"]
138
+ )
139
 
140
  demo.queue().launch()