File size: 6,856 Bytes
3d79800
 
079430a
 
e21f6d3
a2632d3
079430a
 
 
3d79800
990677b
3d79800
 
079430a
990677b
079430a
 
 
 
 
3d79800
079430a
 
 
 
 
 
 
 
3d79800
 
 
 
 
 
079430a
 
 
990677b
079430a
3d79800
 
 
079430a
3d79800
 
 
 
079430a
 
 
3d79800
079430a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d79800
307a45b
3d79800
 
 
 
 
079430a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2632d3
e21f6d3
 
 
 
990677b
e21f6d3
 
a2632d3
e21f6d3
 
990677b
a2632d3
e21f6d3
 
 
a2632d3
079430a
e21f6d3
 
 
 
 
 
a2632d3
e21f6d3
 
079430a
e21f6d3
 
 
 
 
 
a2632d3
e21f6d3
 
 
 
 
 
 
 
a2632d3
e21f6d3
 
 
a2632d3
e21f6d3
 
 
a2632d3
e21f6d3
 
 
 
 
 
 
 
a2632d3
e21f6d3
 
 
 
a2632d3
e21f6d3
a2632d3
e21f6d3
 
a2632d3
 
079430a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179

import spaces
import gradio as gr
import pandas as pd
import yt_dlp
import os
from semantic_chunkers import StatisticalChunker
from semantic_router.encoders import HuggingFaceEncoder
from faster_whisper import WhisperModel
import io

# Function to download YouTube audio and return it as a BytesIO object
def download_youtube_audio(url, preferred_quality="192"):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': preferred_quality,
        }],
        'outtmpl': '-',  # Output to stdout
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            video_title = info_dict.get('title', None)
            print(f"Downloading audio for: {video_title}")

            # Download audio to a BytesIO object
            audio_buffer = io.BytesIO()
            ydl.download([url], audio_buffer)
            audio_buffer.seek(0)
            print("Audio download complete")
            return audio_buffer

    except yt_dlp.utils.DownloadError as e:
        print(f"Error downloading audio: {e}")
        return None

# Function to transcribe audio from BytesIO using WhisperModel
@spaces.GPU
def transcribe(audio_buffer, model_name="medium"):
    model = WhisperModel(model_name)
    print("Reading audio buffer")
    
    # Hypothetical support for BytesIO object
    segments, info = model.transcribe(audio_buffer)
    return segments

# Function to process segments and convert them into a DataFrame
@spaces.GPU
def process_segments(segments):
    result = {}
    print("Processing...")
    for i, segment in enumerate(segments):
        chunk_id = f"chunk_{i}"
        result[chunk_id] = {
            'chunk_id': segment.id,
            'chunk_length': segment.end - segment.start,
            'text': segment.text,
            'start_time': segment.start,
            'end_time': segment.end
        }
    df = pd.DataFrame.from_dict(result, orient='index')
    df.to_csv('final.csv')  # Save DataFrame to final.csv
    return df

# Gradio interface functions
@spaces.GPU
def generate_transcript(youtube_url, model_name="large-v3"):
    audio_buffer = download_youtube_audio(youtube_url)
    if audio_buffer is None:
        return "Error downloading audio"
    
    segments = transcribe(audio_buffer, model_name)
    df = process_segments(segments)
    
    lis = list(df['text'])
    encoder = HuggingFaceEncoder(name="sentence-transformers/all-MiniLM-L6-v2")
    chunker = StatisticalChunker(encoder=encoder, dynamic_threshold=True, min_split_tokens=30, max_split_tokens=40, window_size=2, enable_statistics=False)
    chunks = chunker._chunk(lis)
    
    row_index = 0
    for i in range(len(chunks)):
        for j in range(len(chunks[i].splits)):
            df.at[row_index, 'chunk_id2'] = f'chunk_{i}'
            row_index += 1
    
    grouped = df.groupby('chunk_id2').agg({
        'start_time': 'min',
        'end_time': 'max',
        'text': lambda x: ' '.join(x),
        'chunk_id': list
    }).reset_index()
    
    grouped = grouped.rename(columns={'chunk_id': 'chunk_ids'})
    grouped['chunk_length'] = grouped['end_time'] - grouped['start_time']
    grouped['chunk_id'] = grouped['chunk_id2']
    grouped = grouped.drop(columns=['chunk_id2', 'chunk_ids'])
    grouped.to_csv('final.csv')
    df = pd.read_csv("final.csv")
    transcripts = df.to_dict(orient='records')
    
    return transcripts

# Function to download video using yt-dlp and generate transcript HTML
def download_video(youtube_url):
    ydl_opts = {
        'format': 'mp4',
        'outtmpl': 'downloaded_video.mp4',
        'quiet': True
    }

    with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
        info_dict = ydl.extract_info(youtube_url, download=False)
        video_path = 'downloaded_video.mp4'

    if not os.path.exists(video_path):
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])

    transcripts = generate_transcript(youtube_url)
    transcript_html = ""
    for t in transcripts:
        transcript_html += f'<div class="transcript-block"><a href="#" onclick="var video = document.getElementById(\'video-player\').querySelector(\'video\'); video.currentTime={t["start_time"]}; return false;">' \
                           f'[{t["start_time"]:.2f} - {t["end_time"]:.2f}]<br>{t["text"]}</a></div>'
    
    return video_path, transcript_html

# Function to search the transcript
def search_transcript(keyword):
    transcripts = pd.read_csv("final.csv").to_dict(orient='records')
    search_results = ""
    for t in transcripts:
        if keyword.lower() in t['text'].lower():
            search_results += f'<div class="transcript-block"><a href="#" onclick="var video = document.getElementById(\'video-player\').querySelector(\'video\'); video.currentTime={t["start_time"]}; return false;">' \
                              f'[{t["start_time"]:.2f} - {t["end_time"]:.2f}]<br>{t["text"]}</a></div>'
    return search_results

# CSS for styling
css = """
.fixed-video { width: 480px !important; height: 270px !important; }
.fixed-transcript { width: 480px !important; height: 270px !important; overflow-y: auto; }
.transcript-block { margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9; }
.transcript-block a { text-decoration: none; color: #007bff; }
.transcript-block a:hover { text-decoration: underline; }
"""

# Gradio interface
with gr.Blocks(css=css) as demo:
    gr.Markdown("# YouTube Video Player with Clickable Transcript")

    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", placeholder="Enter YouTube video link here")
        download_button = gr.Button("Download and Display Transcript")
    
    with gr.Row():
        video = gr.Video(label="Video Player", elem_id="video-player", elem_classes="fixed-video")
        transcript_display = gr.HTML(label="Transcript", elem_classes="fixed-transcript")

    with gr.Row():
        search_box = gr.Textbox(label="Search Transcript", placeholder="Enter keyword to search")
        search_button = gr.Button("Search")
        search_results_display = gr.HTML(label="Search Results", elem_classes="fixed-transcript")

    # On button click, download the video and display the transcript
    def display_transcript(youtube_url):
        video_path, transcript_html = download_video(youtube_url)
        return video_path, transcript_html

    download_button.click(fn=display_transcript, inputs=youtube_url, outputs=[video, transcript_display])

    # On search button click, search the transcript and display results
    search_button.click(fn=search_transcript, inputs=search_box, outputs=search_results_display)

# Launch the interface
demo.launch()