IanRonk committed on
Commit
66ee2b4
·
1 Parent(s): bffc7a4

Add punctuation part

Browse files
Files changed (4) hide show
  1. __init__.py +0 -0
  2. app.py +7 -25
  3. functions/__init__.py +0 -0
  4. functions/punctuation.py +58 -0
__init__.py ADDED
File without changes
app.py CHANGED
@@ -1,31 +1,13 @@
 
1
  import gradio as gr
2
- import requests
3
- from youtube_transcript_api import YouTubeTranscriptApi
4
- import json
5
 
6
 
7
- def retrieve_url(vid_id):
8
- try:
9
- transcript = YouTubeTranscriptApi.get_transcript(vid_id)
10
- except Exception as e:
11
- raise e
12
- req = requests.get(
13
- f"https://yt.lemnoslife.com/noKey/videos?part=snippet&id={vid_id}"
14
- )
15
- if req.status_code == 200:
16
- information = json.loads(req.content)
17
- else:
18
- # print(req.status_code)
19
- information = None
20
- return ""
21
- # print(transcript)
22
- text = " ".join([x["text"] for x in transcript])
23
- return text
24
 
25
 
26
- def greet(name):
27
- return "Hello " + name + "!!"
28
-
29
-
30
- demo = gr.Interface(fn=retrieve_url, inputs="text", outputs="text")
31
  demo.launch(share=True)
 
1
+ from os import pipe
2
  import gradio as gr
3
+ from functions.punctuation import punctuate
 
 
4
 
5
 
6
def pipeline(video_id):
    """Return the punctuated transcript for ``video_id``.

    Thin wrapper around ``punctuate``; this is the function handed to the
    Gradio interface as its callback.
    """
    return punctuate(video_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
# Manual smoke test (kept disabled):
# print(pipeline("VL5M5ZihJK4"))

# Text-in / text-out Gradio interface whose callback is ``pipeline``.
demo = gr.Interface(fn=pipeline, inputs="text", outputs="text")

# Runs at import time. NOTE(review): share=True presumably requests a public
# share link from Gradio -- confirm against the Gradio launch() documentation.
demo.launch(share=True)
functions/__init__.py ADDED
File without changes
functions/punctuation.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from youtube_transcript_api import YouTubeTranscriptApi
3
+ import json
4
+ import os
5
+
6
# Request headers passed to requests.post() in query_punctuation().
# The token is read from the HF_Token environment variable when this module
# is imported, so the import raises KeyError if that variable is not set.
headers = {
    "Authorization": f"Bearer {os.environ['HF_Token']}"
}  # NOTE: put this somewhere else
9
+
10
+
11
def retrieve_transcript(vid_id):
    """Fetch the transcript for a YouTube video, best-effort.

    Args:
        vid_id: Video identifier passed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The value returned by ``YouTubeTranscriptApi.get_transcript`` on
        success, or ``None`` if the lookup raised any exception.
    """
    import logging  # function-scope import keeps this block self-contained

    try:
        return YouTubeTranscriptApi.get_transcript(vid_id)
    except Exception:
        # Deliberately best-effort: callers get None rather than an exception.
        # Log the cause so the failure is not silently lost.
        logging.getLogger(__name__).warning(
            "Could not retrieve transcript for video %r", vid_id, exc_info=True
        )
        return None
17
+
18
+
19
def split_transcript(transcript, chunk_size=40):
    """Group transcript entries into space-joined text chunks.

    Args:
        transcript: Sequence of mappings that each have a ``"text"`` key.
            ``None`` or an empty sequence is accepted and yields no chunks
            (``retrieve_transcript`` returns ``None`` on failure).
        chunk_size: Maximum number of transcript entries per chunk.

    Returns:
        A list of strings; each string is the ``"text"`` values of up to
        ``chunk_size`` consecutive entries joined with single spaces.
    """
    if not transcript:
        # Nothing to split; avoids calling len() on None.
        return []
    return [
        " ".join(entry["text"] for entry in transcript[start : start + chunk_size])
        for start in range(0, len(transcript), chunk_size)
    ]
25
+
26
+
27
def query_punctuation(splits, timeout=60):
    """Send text chunks to the hosted punctuation model and return its JSON.

    Args:
        splits: List of text chunks, sent as the ``"inputs"`` field of the
            request payload.
        timeout: Seconds to wait for the HTTP request before ``requests``
            gives up. Without a timeout the call could block indefinitely.

    Returns:
        The decoded JSON body of the response, or ``[]`` when ``splits`` is
        empty (no request is made in that case).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    if not splits:
        # Nothing to punctuate; skip the network round-trip.
        return []

    api_url = "https://api-inference.huggingface.co/models/oliverguhr/fullstop-punctuation-multilang-large"
    response = requests.post(
        api_url,
        headers=headers,
        json={"inputs": splits},
        timeout=timeout,
    )
    # Fail here with a clear HTTP error instead of handing an error payload
    # to the parser downstream.
    response.raise_for_status()
    return response.json()
32
+
33
+
34
def parse_output(output, comb):
    """Insert predicted punctuation marks into the original text chunks.

    Args:
        output: Model response; one list per chunk, each holding mappings
            with an ``"entity_group"`` label and an ``"end"`` character
            offset into the matching chunk.
        comb: The text chunks that were sent to the model, in the same order
            as ``output``.

    Returns:
        All chunks, with punctuation inserted, joined by single spaces.

    Raises:
        ValueError: If ``output`` is a mapping rather than a list of
            per-chunk predictions (presumably an API error payload).
    """
    if isinstance(output, dict):
        # NOTE(review): assumes error payloads carry an "error" key -- confirm
        # against the Inference API documentation.
        raise ValueError(
            f"Unexpected punctuation response: {output.get('error', output)}"
        )

    skipped_labels = ("0", ",", "")  # labels that insert nothing
    punctuated = []
    for index, predictions in enumerate(output):
        text = comb[index]
        shift = 0  # characters inserted so far; model offsets refer to the raw chunk
        for prediction in predictions:
            label = prediction["entity_group"]
            if label in skipped_labels:
                continue
            position = prediction["end"] + shift
            text = text[:position] + label + text[position:]
            # Advance by the label's real length so later offsets stay aligned
            # even if a label is more than one character long.
            shift += len(label)
        punctuated.append(text)
    return " ".join(punctuated)
49
+
50
+
51
def punctuate(video_id):
    """Return the punctuated transcript of a YouTube video.

    Args:
        video_id: Video identifier handed to ``retrieve_transcript``.

    Returns:
        The transcript text with punctuation inserted, or an empty string
        when no transcript could be retrieved.
    """
    # Get the transcript from the YouTubeTranscriptApi (None on failure).
    transcript = retrieve_transcript(video_id)
    if not transcript:
        # No transcript available: return empty text instead of crashing
        # further down the pipeline.
        return ""

    splits = split_transcript(transcript)  # Chunk the transcript text
    resp = query_punctuation(splits)  # Get the response from the Inference API
    return parse_output(resp, splits)