File size: 4,712 Bytes
e06c27b
 
65e0b57
21425d1
6c022f9
65e0b57
e06c27b
 
 
 
 
179d87b
e06c27b
179d87b
e06c27b
 
 
179d87b
e06c27b
 
 
 
 
 
 
611b363
179d87b
 
 
6c022f9
179d87b
 
 
 
 
 
 
 
6c022f9
 
 
179d87b
6c022f9
179d87b
 
 
 
 
 
 
 
 
6c022f9
 
 
e06c27b
179d87b
 
 
 
 
 
 
 
 
e06c27b
179d87b
e06c27b
 
 
179d87b
 
e06c27b
 
 
 
 
 
179d87b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e06c27b
 
 
 
 
 
 
 
 
 
 
 
 
179d87b
e06c27b
 
 
 
 
 
 
 
 
179d87b
e06c27b
179d87b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e06c27b
 
179d87b
e06c27b
6c022f9
179d87b
 
 
 
6c022f9
e06c27b
179d87b
 
6c022f9
 
 
 
179d87b
 
6c022f9
 
179d87b
e06c27b
179d87b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import os

import streamlit as st

from urllib.parse import urlparse, parse_qs

from stqdm import stqdm

# Turn on the PyTorch MPS fallback switch; background in
# https://github.com/pytorch/pytorch/issues/77764
# NOTE(review): presumably this has to be set before the transformers/torch
# imports below so that operators missing on MPS fall back to CPU - confirm.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled

from transformers import pipeline

import torch

# Setting device for PyTorch: prefer a CUDA GPU, then Apple's MPS backend,
# and fall back to the CPU otherwise.
# `torch.has_mps` is deprecated, so availability is queried through
# `torch.backends.mps.is_available()`. The `getattr` lookup keeps the check
# safe on torch builds that do not ship the `mps` backend module at all.
_mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')


class InvalidURLException(Exception):
    '''Raised when a URL cannot be resolved to a YouTube video ID.'''


def get_videoid_from_url(url: str):
    '''
    Gets video ID from given YouTube video URL

    Supported formats:
      * standard: https://www.youtube.com/watch?v=<id>
      * short:    https://youtu.be/<id>

    :param url: YouTube video URL in 2 formats (standard and short)
    :return: id of YouTube video
    :raises InvalidURLException: If URL is not valid
    '''
    # Surrounding whitespace (e.g. from copy/paste) must not end up in the ID
    url_data = urlparse(url.strip())
    query = parse_qs(url_data.query)

    if 'youtube.com' in url_data.netloc and 'v' in query:
        video_id = query['v'][0]
    elif 'youtu.be' in url_data.netloc:
        # Read the ID from the parsed path rather than from the raw URL so
        # that a query string or fragment (e.g. "?t=42") or a trailing slash
        # is not glued onto / mistaken for the ID.
        video_id = url_data.path.strip('/').split('/')[0]
    else:
        raise InvalidURLException('Invalid URL')

    # A short URL without any path (e.g. "https://youtu.be/") has no ID
    if not video_id:
        raise InvalidURLException('Invalid URL')

    return video_id


def get_transcripts(url: str):
    '''
    Loads English transcripts for given URL

    Manually created transcripts are preferred; when none is found the
    auto-generated ones are tried instead. Entries whose text is exactly
    '[Music]' are dropped.

    :param url: YouTube video URL
    :return: list, list of subtitles
    :raises InvalidURLException: If URL is not valid
    '''
    # Resolve the ID from the argument; the function must not depend on the
    # module-level text input value.
    video_id = get_videoid_from_url(url)

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    try:
        transcript = transcript_list.find_manually_created_transcript(['en'])
    except NoTranscriptFound:
        st.info('No manual transcripts were found, trying to load generated ones...')
        # Errors raised by this second lookup are left to the caller
        transcript = transcript_list.find_generated_transcript(['en'])

    return [sbt['text'] for sbt in transcript.fetch() if sbt['text'] != '[Music]']


def generate_summary(subtitles: list):
    '''
    Creates summary based on subtitles of YouTube video.

    Uses T5-small model which shows best results for different topics
    of videos. Subtitles are summarized in consecutive chunks and the
    chunk summaries are joined with spaces.

    :param subtitles: list of subtitles strings
    :return: summary based on subtitles, empty string when there is
        no subtitle text to summarize
    '''
    total_len = sum(len(sbt) for sbt in subtitles)

    # Nothing to summarize (no subtitles or only empty strings); also
    # avoids dividing by zero below.
    if total_len == 0:
        return ''

    sbt_mean_len = total_len / len(subtitles)

    # Number of subtitles per step/summary.
    # Since the length of transcript lines differs between generated and
    # manual ones, the step size is derived from the mean line length.
    # At least one subtitle per step, otherwise very long lines would
    # yield a step size of 0.
    n_sbt_per_step = max(1, int(400 / (sbt_mean_len / 4)))

    # Building the pipeline loads the model, so do it once instead of
    # once per chunk.
    summarizer = pipeline('summarization', model='t5-small', tokenizer='t5-small',
                          max_length=512, truncation=True)

    summaries = []

    for start in stqdm(range(0, len(subtitles), n_sbt_per_step)):
        sbt_txt = ' '.join(subtitles[start:start + n_sbt_per_step])

        summary = summarizer(sbt_txt, do_sample=False)
        summaries.append(summary[0]['summary_text'])

    return ' '.join(summaries)


def process_click_callback():
    '''
    Callback for process button click

    Loads the transcripts for the URL held in the module-level
    ``video_url_inp``, summarizes them and stores the result in
    ``st.session_state.summary_output``. Problems with the URL or the
    transcripts are reported with ``st.error`` and end the callback.
    '''
    global is_processing

    if is_processing:
        return

    is_processing = True

    try:
        subtitles = get_transcripts(video_url_inp)
    except InvalidURLException:
        st.error('Invalid YouTube URL, please provide URL in format that is shown on Examples')
    except (TranscriptsDisabled, NoTranscriptFound):
        # NoTranscriptFound can still escape get_transcripts when the
        # generated-transcript fallback finds nothing either.
        st.error('Could not retrieve a transcript for given ID')
    else:
        # Only reached when transcripts were loaded. The error branches end
        # the callback themselves instead of relying on a rerun request to
        # abort it, so `subtitles` is always bound here.
        st.session_state.summary_output = generate_summary(subtitles)
        st.success('Processing complete!', icon="✅")
    finally:
        # Always release the flag, even if summarization fails
        is_processing = False


if __name__ == "__main__":
    # State of processing, read and updated by process_click_callback
    is_processing = False

    st.title('YouTube Video Summary 📃')
    st.markdown('Creates summary for given YouTube video URL based on transcripts.')
    # Examples of the two supported URL formats (standard and short)
    st.code('https://www.youtube.com/watch?v=skl4OXNA12U')
    st.code('https://youtu.be/mEQc-iAbEBk')

    col1, col2 = st.columns(2)

    with col1:
        video_url_inp = st.text_input('YouTube Video URL:', placeholder='YouTube URL',
                                      label_visibility='collapsed')

    with col2:
        process_btn = st.button('🗜️Process', key='process_btn', on_click=process_click_callback)

    # Output area, filled by the callback through st.session_state.summary_output.
    # A real label that is visually hidden is used instead of an empty one,
    # consistent with the URL input above.
    summary_out_txt = st.text_area(label='Summary', key='summary_output', height=400,
                                   label_visibility='collapsed')