alankabisov committed on
Commit
179d87b
•
1 Parent(s): 9292468

added ui and refactoring

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. .streamlit/config.toml +4 -0
  3. app.py +92 -49
  4. requirements.txt +2 -1
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .idea/
.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [theme]
2
+ backgroundColor="#d2e7ee"
3
+ secondaryBackgroundColor="#79c1ee"
4
+ textColor="#151516"
app.py CHANGED
@@ -1,22 +1,20 @@
1
  import os
2
 
3
-
4
  import streamlit as st
5
  from urllib.parse import urlparse, parse_qs
6
 
7
- from tqdm import tqdm
8
  from stqdm import stqdm
9
 
10
  # https://github.com/pytorch/pytorch/issues/77764
11
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
12
 
13
- from youtube_transcript_api import YouTubeTranscriptApi
14
 
15
- from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
16
 
17
  import torch
18
 
19
- # Setting device for PYTorch
20
  if torch.cuda.is_available():
21
  device = torch.device('cuda')
22
  elif torch.has_mps:
@@ -25,47 +23,79 @@ else:
25
  device = torch.device('cpu')
26
 
27
 
 
 
 
28
 
29
- def get_videoid_from_url(url:str):
 
 
 
 
 
 
 
30
  url_data = urlparse(url)
31
  query = parse_qs(url_data.query)
32
 
33
- try:
34
  video_id = query["v"][0]
35
- except KeyError:
36
- video_id = ''
 
 
 
 
 
 
 
37
 
38
  return video_id
39
 
40
- def process_click_callback():
41
- st.session_state.process_btn = True
42
 
43
- print('Using {} device'.format(device))
 
 
 
 
 
 
 
 
44
 
45
- transcript_list = YouTubeTranscriptApi.list_transcripts('aircAruvnKk') # 3blue1Brown
46
 
47
  try:
48
  transcript = transcript_list.find_manually_created_transcript(['en'])
49
- except Exception as e:
50
- print('No manual transcripts were found, trying to load generated ones...')
51
  transcript = transcript_list.find_generated_transcript(['en'])
52
 
53
  subtitles = transcript.fetch()
54
 
55
  subtitles = [sbt['text'] for sbt in subtitles if sbt['text'] != '[Music]']
56
- subtitles_len = [len(sbt) for sbt in subtitles]
57
- sbt_mean_len = sum(subtitles_len)/len(subtitles_len)
58
 
59
- print('Mean length of subtitles: {}'.format(sbt_mean_len))
60
- print(subtitles)
61
- print(len(subtitles))
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # Number of subtitles per step/summary
64
  # Since number length of transcripts differs
65
  # between generated and manual ones
66
  # we set different step size
67
  n_sbt_per_step = int(400 / (sbt_mean_len / 4))
68
- print('Number subtitles per summary: {}'.format(n_sbt_per_step))
69
 
70
  n_steps = len(subtitles) // n_sbt_per_step if len(subtitles) % n_sbt_per_step == 0 else \
71
  len(subtitles) // n_sbt_per_step + 1
@@ -73,9 +103,7 @@ def process_click_callback():
73
  summaries = []
74
 
75
  for i in stqdm(range(n_steps)):
76
- sbt_txt = ' '.join(subtitles[n_sbt_per_step*i:n_sbt_per_step*(i+1)])
77
- # print('length of text: {}'.format(len(sbt_txt)))
78
- # print(sbt_txt)
79
 
80
  summarizer = pipeline('summarization', model='t5-small', tokenizer='t5-small',
81
  max_length=512, truncation=True)
@@ -83,44 +111,59 @@ def process_click_callback():
83
  summary = summarizer(sbt_txt, do_sample=False)
84
  summary = summary[0]['summary_text']
85
 
86
- # print('Summary: ' + summary)
87
  summaries.append(summary)
88
 
89
- out = ' '.join(summaries)
90
- print(out)
91
 
92
- st.session_state.summary_output = out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  st.success('Processing complete!', icon="βœ…")
94
- st.session_state.process_btn = False
95
 
 
96
 
97
 
98
- def main():
 
 
 
99
  st.title('YouTube Video Summary πŸ“ƒ')
100
  st.markdown('Creates summary for given YouTube video URL based on transcripts.')
101
- st.code('https://www.youtube.com/watch?v=aircAruvnKk')
102
- st.code('https://youtu.be/p0G68ORc8uQ')
103
 
104
  col1, col2 = st.columns(2)
105
 
106
  with col1:
107
- video_url = st.text_input('YouTube Video URL:', placeholder='YouTube URL',
108
- label_visibility='collapsed')
109
- st.write(get_videoid_from_url(video_url))
110
 
111
  with col2:
112
- st.button('Process πŸ“­', key='process_btn', on_click=process_click_callback)
113
-
114
- st.text_area(label='', key='summary_output', height=444)
115
-
116
 
117
-
118
-
119
-
120
-
121
- # x = st.slider('Select a value')
122
- # st.write(x, 'squared is', x * x)
123
-
124
-
125
- if __name__ == "__main__":
126
- main()
 
1
  import os
2
 
 
3
  import streamlit as st
4
  from urllib.parse import urlparse, parse_qs
5
 
 
6
  from stqdm import stqdm
7
 
8
  # https://github.com/pytorch/pytorch/issues/77764
9
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
10
 
11
+ from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
12
 
13
+ from transformers import pipeline
14
 
15
  import torch
16
 
17
+ # Setting device for PyTorch
18
  if torch.cuda.is_available():
19
  device = torch.device('cuda')
20
  elif torch.has_mps:
 
23
  device = torch.device('cpu')
24
 
25
 
26
class InvalidURLException(Exception):
    '''
    Raised when a string cannot be interpreted as a YouTube video URL.
    '''


def get_videoid_from_url(url: str):
    '''
    Gets video ID from given YouTube video URL.

    Two URL formats are recognised:

    * standard: ``https://www.youtube.com/watch?v=<id>``
    * short:    ``https://youtu.be/<id>``

    :param url: YouTube video URL in one of the 2 formats (standard and short)
    :return: id of YouTube video
    :raises InvalidURLException: If URL is not valid or carries no video id
    '''
    try:
        url_data = urlparse(url)
        host = url_data.hostname or ''
    except ValueError as err:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6
        # brackets); report them the same way as any other bad URL.
        raise InvalidURLException('Invalid URL') from err

    # Match the host exactly (or as a subdomain) so that look-alike hosts
    # such as "notyoutube.com" are not accepted.
    if host == 'youtube.com' or host.endswith('.youtube.com'):
        video_id = parse_qs(url_data.query).get('v', [''])[0]
    elif host == 'youtu.be' or host.endswith('.youtu.be'):
        # Read the id from the parsed path so that a query string
        # ("?t=42") or a trailing slash does not end up in the id.
        video_id = url_data.path.strip('/').split('/')[0]
    else:
        raise InvalidURLException('Invalid URL')

    if not video_id:
        raise InvalidURLException('Invalid URL')

    return video_id
54
 
 
 
55
 
56
def get_transcripts(url: str):
    '''
    Loads English transcripts for given URL.

    A manually created transcript is preferred; when none exists an
    informational message is shown in the UI and the auto-generated
    transcript is requested instead. Entries whose text is exactly
    ``[Music]`` are dropped.

    :param url: YouTube video URL
    :return: list, list of subtitles
    :raises InvalidURLException: propagated from get_videoid_from_url
        when the URL is not valid
    '''
    # Use the argument that was passed in; the previous version read the
    # module-level text-input value and silently ignored ``url``.
    video_id = get_videoid_from_url(url)

    # NOTE(review): the caller catches TranscriptsDisabled around this
    # function; presumably it is raised by the transcript API calls below.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    try:
        transcript = transcript_list.find_manually_created_transcript(['en'])
    except NoTranscriptFound:
        st.info('No manual transcripts were found, trying to load generated ones...')
        transcript = transcript_list.find_generated_transcript(['en'])

    return [sbt['text'] for sbt in transcript.fetch() if sbt['text'] != '[Music]']
79
+
80
+
81
def generate_summary(subtitles: list):
    '''
    Creates summary based on subtitles of YouTube video.

    The subtitles are grouped into consecutive chunks, each chunk is
    summarised independently with the ``t5-small`` summarization pipeline
    (chosen by the original author as giving the best results across
    video topics), and the partial summaries are joined with spaces.

    :param subtitles: list of subtitles strings
    :return: summary based on subtitles; empty string when there is no
        subtitle text to summarise
    '''
    # Nothing to summarise: avoids dividing by zero below when the list is
    # empty or every subtitle is an empty string.
    if not any(subtitles):
        return ''

    sbt_mean_len = sum(len(sbt) for sbt in subtitles) / len(subtitles)

    # Number of subtitles per step/summary.
    # Since the length of transcript lines differs between generated and
    # manual ones, the step size is derived from the mean line length.
    # Clamp to at least 1 so very long lines cannot produce a zero step.
    n_sbt_per_step = max(1, int(400 / (sbt_mean_len / 4)))

    # Build the pipeline once instead of once per chunk.
    summarizer = pipeline('summarization', model='t5-small', tokenizer='t5-small',
                          max_length=512, truncation=True)

    summaries = []

    # Stepping through the start offsets covers the final, shorter chunk
    # without a separate ceiling-division computation.
    for start in stqdm(range(0, len(subtitles), n_sbt_per_step)):
        sbt_txt = ' '.join(subtitles[start:start + n_sbt_per_step])

        summary = summarizer(sbt_txt, do_sample=False)
        summaries.append(summary[0]['summary_text'])

    return ' '.join(summaries)
 
117
 
118
+
119
def process_click_callback():
    '''
    Callback for process button click.

    Reads the URL from the module-level ``video_url_inp``, loads the
    subtitles, generates the summary and stores it in
    ``st.session_state.summary_output``. Problems with the URL or with
    transcript retrieval are reported through ``st.error`` and end the
    callback early.
    '''
    global is_processing

    # Ignore the click while a previous request is still being handled
    if is_processing:
        return

    is_processing = True

    try:
        try:
            subtitles = get_transcripts(video_url_inp)
        except InvalidURLException:
            st.error('Invalid YouTube URL, please provide URL in format that is shown on Examples')
            # Return explicitly instead of relying on a rerun request to
            # abort the callback; Streamlit reruns the script after a
            # callback returns anyway.
            return
        except (TranscriptsDisabled, NoTranscriptFound):
            st.error('Could not retrieve a transcript for given ID')
            return

        st.session_state.summary_output = generate_summary(subtitles)
        st.success('Processing complete!', icon="✅")
    finally:
        # Always release the flag, including on early return or an
        # unexpected exception.
        is_processing = False
149
 
150
 
151
if __name__ == "__main__":
    # State of processing; read and updated by process_click_callback
    is_processing = False

    st.title('YouTube Video Summary 📃')
    st.markdown('Creates summary for given YouTube video URL based on transcripts.')

    # Example URLs in the two supported formats (standard and short)
    st.code('https://www.youtube.com/watch?v=skl4OXNA12U')
    st.code('https://youtu.be/mEQc-iAbEBk')

    col1, col2 = st.columns(2)

    with col1:
        # Module-level on purpose: process_click_callback reads this value
        video_url_inp = st.text_input('YouTube Video URL:', placeholder='YouTube URL',
                                      label_visibility='collapsed')

    with col2:
        process_btn = st.button('🗜️Process', key='process_btn', on_click=process_click_callback)

    # The callback writes the result into st.session_state.summary_output,
    # which this widget displays via its key. A real label hidden with
    # label_visibility replaces the empty label.
    summary_out_txt = st.text_area(label='Summary', key='summary_output', height=400,
                                   label_visibility='collapsed')
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ torch
2
  transformers
3
  youtube_transcript_api
4
  tqdm
5
- stqdm
 
 
2
  transformers
3
  youtube_transcript_api
4
  tqdm
5
+ stqdm
6
+ streamlit