Spaces:
Runtime error
Runtime error
File size: 2,350 Bytes
52f8ce8 5d656c3 9cdc5fa 5d656c3 52f8ce8 f46c80d da90409 52f8ce8 f46c80d 9cdc5fa f46c80d 52f8ce8 9cdc5fa 52f8ce8 5d656c3 f115402 5d656c3 9cdc5fa 5d656c3 52f8ce8 9cdc5fa 5d656c3 52f8ce8 5d656c3 52f8ce8 48b0c57 9cdc5fa 5d656c3 52f8ce8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import pathlib
from faster_whisper import WhisperModel
import yt_dlp
import uuid
import os
import gradio as gr
# The full list of video sites supported by yt-dlp is at
# https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md
def download_convert_video_to_audio(
    yt_dlp,
    video_url: str,
    destination_path: pathlib.Path,
) -> None:
    """Download the best audio stream of a video and extract it as MP3.

    Args:
        yt_dlp: The ``yt_dlp`` module, or any object exposing a compatible
            ``YoutubeDL`` context manager. It is passed in rather than
            referenced globally so the caller chooses the implementation.
        video_url: URL of the video to download.
        destination_path: Output path *without* a file extension. It is only
            interpolated into strings here, so a plain ``str`` works as well
            as a ``pathlib.Path``. The output template appends ``.%(ext)s``.

    Raises:
        Whatever ``YoutubeDL`` raises; errors propagate to the caller
        unchanged.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {  # Extract audio using ffmpeg
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
            }
        ],
        "outtmpl": f"{destination_path}.%(ext)s",
    }

    print(f"Downloading video from {video_url}")
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # ``download`` takes a list of URLs; wrap the single URL explicitly
        # instead of relying on the library to tolerate a bare string.
        ydl.download([video_url])
    print(f"Downloaded video from {video_url} to {destination_path}")
def segment_to_dict(segment):
    """Turn a segment into a plain dict, converting its words too.

    ``segment`` must expose ``_asdict()`` (for example a namedtuple) and the
    resulting mapping must contain a ``"words"`` key. When ``"words"`` is not
    ``None``, each entry is converted with its own ``_asdict()``.
    """
    as_dict = segment._asdict()
    words = as_dict["words"]
    if words is None:
        # Nothing nested to convert.
        return as_dict
    as_dict["words"] = [entry._asdict() for entry in words]
    return as_dict
def download_video(video_url: str) -> str:
    """Download the audio of ``video_url`` into ``/content`` under a random name.

    Args:
        video_url: URL of the video to download.

    Returns:
        The expected path of the extracted audio, ``/content/<hex>.mp3``.
        The name is random, so returning it is the only way a caller can
        locate the file afterwards. The ``.mp3`` suffix follows from the
        downloader requesting MP3 extraction.
    """
    destination = f"/content/{uuid.uuid4().hex}"
    download_convert_video_to_audio(yt_dlp, video_url, destination)
    return f"{destination}.mp3"
def transcribe_video(video_url: str, beam_size: int = 5, model_size: str = "tiny", word_timestamps: bool = True):
    """Download a video's audio and transcribe it with faster-whisper on CPU.

    Args:
        video_url: URL of the video to transcribe.
        beam_size: Beam size forwarded to ``WhisperModel.transcribe``.
        model_size: Model name/size forwarded to ``WhisperModel``.
        word_timestamps: Forwarded to ``WhisperModel.transcribe``.

    Returns:
        A list with one dict per transcribed segment, as produced by
        ``segment_to_dict``.

    Raises:
        Any error from the download or the transcription propagates; the
        temporary MP3 is removed either way.
    """
    print("loading model")
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    print("getting hex")
    rand_id = uuid.uuid4().hex
    audio_base = f"/content/{rand_id}"
    audio_path = f"{audio_base}.mp3"
    print("doing download")
    try:
        download_convert_video_to_audio(yt_dlp, video_url, audio_base)
        segments, info = model.transcribe(audio_path, beam_size=beam_size, word_timestamps=word_timestamps)
        # Materialise the segments before the audio file is deleted;
        # faster-whisper documents ``segments`` as a lazy generator, so the
        # file presumably still needs to exist while it is consumed.
        segments = [segment_to_dict(segment) for segment in segments]
        print(info)
    finally:
        # Always clean up the temporary audio, including on failure. The file
        # may not exist if the download itself failed, which is fine.
        try:
            os.remove(audio_path)
        except FileNotFoundError:
            pass
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
    print(segments)
    return segments
# print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
# for segment in segments:
# print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
# Single text box in (the video URL), text out (the transcribed segments).
# Only ``video_url`` is supplied by the UI; the other ``transcribe_video``
# parameters keep their defaults.
demo = gr.Interface(fn=transcribe_video, inputs="text", outputs="text")
demo.launch()