BilalSardar commited on
Commit
368af9f
1 Parent(s): a356cd0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +269 -0
app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from pathlib import Path
4
+ from pydub import AudioSegment
5
+ from pydub.utils import make_chunks
6
+ import os
7
+ import warnings
8
+ import speech_recognition as sr
9
+ import torch
10
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
11
+ import nltk
12
+ from flashtext import KeywordProcessor
13
+ from collections import OrderedDict
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+
16
+ nltk.download('punkt')
17
+ nltk.download('brown')
18
+ nltk.download('wordnet')
19
+ nltk.download('stopwords')
20
+ from nltk.corpus import wordnet as wn
21
+ from nltk.tokenize import sent_tokenize
22
+ from textwrap3 import wrap
23
+ import random
24
+ import numpy as np
25
+ from nltk.corpus import stopwords
26
+ import string
27
+ import pke
28
+ import traceback
29
+
30
+
31
+ warnings.filterwarnings("ignore")
32
+
33
+
34
def Process_audio(fileName):
    """Split a WAV file into 8-second chunks and transcribe each chunk
    via Google's free speech-recognition API, writing the recognized
    text (one "sentence" per chunk) to ``The_audio.txt``.

    Chunks are exported to ``./chunked/``; chunks that cannot be
    recognized are skipped with a console message.
    """
    # Make sure the chunk directory exists before exporting into it.
    os.makedirs("chunked", exist_ok=True)
    recognizer = sr.Recognizer()
    myaudio = AudioSegment.from_wav(fileName)
    chunk_length_ms = 8000  # 8-second chunks keep requests small enough for the API
    chunks = make_chunks(myaudio, chunk_length_ms)
    # Use only the base name: the original concatenated the full path,
    # which produced an invalid chunk path whenever fileName contained
    # directory components.
    base = os.path.basename(fileName)
    # Context manager guarantees the transcript file is closed/flushed;
    # the original left the handle open for the process lifetime.
    with open("The_audio.txt", "w+") as txtf:
        for i, chunk in enumerate(chunks):
            chunkName = './chunked/' + base + "_{0}.wav".format(i)
            print("I am Exporting", chunkName)
            chunk.export(chunkName, format="wav")
            with sr.AudioFile(chunkName) as source:
                audio_listened = recognizer.listen(source)
            try:
                rec = recognizer.recognize_google(audio_listened)
                txtf.write(rec + ".")
            except sr.UnknownValueError:
                print("I dont recognize your audio")
            except sr.RequestError:
                print("could not get result")
57
+
58
# Ensure the working directory for exported audio chunks exists.
# exist_ok=True replaces the old bare `try/except: pass`, which
# silently swallowed *every* exception (including permission errors
# unrelated to the directory already existing).
os.makedirs("chunked", exist_ok=True)
62
+
63
def UrlToAudio(VideoUrl):
    """Download the audio track of *VideoUrl* as WAV (via youtube-dl)
    and transcribe the first ``*.wav`` file found in the working
    directory with ``Process_audio``.

    The downloader is invoked with an argument list (``shell=False``),
    so a malicious URL cannot inject shell commands — the original
    built a shell string with ``os.system("youtube-dl ... " + url)``.
    """
    import subprocess  # local import keeps the module import block untouched

    subprocess.run(["youtube-dl", "-x", "--audio-format", "wav", VideoUrl],
                   check=False)  # best-effort, matching os.system's behavior
    # Process only the first WAV found (mirrors the original `break`).
    base_path = Path(r"")
    for wav_file_path in base_path.glob("*.wav"):
        Process_audio(str(wav_file_path))
        break
71
+
72
+
73
# Load the T5-base summarization model/tokenizer once at import time
# (downloads weights on first run).
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Run on GPU when available, otherwise CPU; `device` is reused by the
# other model calls in this module.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)
78
+
79
+
80
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch (CPU + all CUDA devices)
    random number generators for reproducible runs."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
85
+
86
def postprocesstext (content):
    """Capitalize each sentence of *content* and re-join them.

    Preserves the original behavior of prefixing every sentence
    (including the first) with a single space.
    """
    sentences = [sent.capitalize() for sent in sent_tokenize(content)]
    return "".join(" " + sentence for sentence in sentences)
92
+
93
+
94
def summarizer(text,model,tokenizer):
    """Summarize *text* with a T5 *model*/*tokenizer* pair.

    Prepends the "summarize: " task prefix, generates with beam search
    on the module-level `device`, and returns the summary cleaned up by
    `postprocesstext` and stripped of surrounding whitespace.
    """
    prompt = "summarize: " + text.strip().replace("\n", " ")
    encoding = tokenizer.encode_plus(
        prompt, max_length=512, pad_to_max_length=False,
        truncation=True, return_tensors="pt").to(device)

    generated = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=3,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        min_length=75,
        max_length=300)

    decoded = [tokenizer.decode(ids, skip_special_tokens=True)
               for ids in generated]
    return postprocesstext(decoded[0]).strip()
119
+
120
+
121
def get_nouns_multipartite(content):
    """Extract up to 15 noun/proper-noun keyphrases from *content*
    using pke's MultipartiteRank.

    Returns an empty list (after printing the traceback) when
    extraction fails, so callers never see an exception.
    """
    out = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()

        # Candidates: nouns and proper nouns only; exclude punctuation,
        # PTB bracket tokens, and English stopwords.
        pos = {'PROPN', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')

        extractor.load_document(input=content, language='en',
                                stoplist=stoplist,
                                normalization=None)

        extractor.candidate_selection(pos=pos)
        # Build the Multipartite graph and rank candidates via random
        # walk; alpha controls the weight adjustment mechanism (see
        # TopicRank for the threshold/method parameters).
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)

        out = [phrase for phrase, _score in keyphrases]
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; extraction errors are reported, not fatal.
        out = []
        traceback.print_exc()

    return out
154
+
155
def get_keywords(originaltext,summarytext):
    """Return up to four keyphrases extracted from *originaltext* that
    also appear in *summarytext*, preserving extraction-rank order."""
    candidates = get_nouns_multipartite(originaltext)
    print ("keywords unsummarized: ",candidates)

    processor = KeywordProcessor()
    for phrase in candidates:
        processor.add_keyword(phrase)

    found = list(set(processor.extract_keywords(summarytext)))
    print ("keywords_found in summarized: ",found)

    # Filter while keeping the original candidate ordering.
    important_keywords = [phrase for phrase in candidates if phrase in found]
    return important_keywords[:4]
172
+
173
# Question-generation model: T5 fine-tuned on SQuAD v1 (downloads on first run).
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = question_model.to(device)  # same device as the summarizer
176
+
177
def get_question(context,answer,model,tokenizer):
    """Generate a question from *context* whose answer is *answer*,
    using a SQuAD-finetuned T5 *model*/*tokenizer* pair."""
    prompt = "context: {} answer: {}".format(context, answer)
    encoding = tokenizer.encode_plus(
        prompt, max_length=384, pad_to_max_length=False,
        truncation=True, return_tensors="pt").to(device)

    generated = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=72)

    decoded = [tokenizer.decode(ids, skip_special_tokens=True)
               for ids in generated]
    # Drop the model's "question:" prefix and surrounding whitespace.
    return decoded[0].replace("question:", "").strip()
197
def get_distractors_wordnet(word):
    """Return WordNet-based distractors for *word*: the co-hyponyms of
    its first noun sense (siblings under the first hypernym),
    title-cased, excluding the word itself and duplicates.

    Returns an empty list when the word has no noun synset or no
    hypernym (a console message is printed on lookup failure).
    """
    distractors = []
    try:
        syn = wn.synsets(word, 'n')[0]  # IndexError if no noun sense -> except below
        orig_word = word.lower()
        # NOTE: the original also computed word.replace(" ", "_") but
        # never used it, and checked `name is not None` which is always
        # true for a lemma name; both removed as dead code.
        hypernym = syn.hypernyms()
        if len(hypernym) == 0:
            return distractors
        for item in hypernym[0].hyponyms():
            name = item.lemmas()[0].name()
            if name == orig_word:
                continue
            name = name.replace("_", " ")
            name = " ".join(w.capitalize() for w in name.split())
            if name not in distractors:
                distractors.append(name)
    except Exception:
        # Narrowed from bare `except:`; typically IndexError when the
        # word has no noun synset.
        print ("Wordnet distractors not found")
    return distractors
221
+
222
+
223
+
224
# Gradio UI components (legacy pre-3.x gr.inputs/gr.outputs API).
context1 = gr.inputs.Textbox(lines=10, placeholder="Enter link here...")  # video URL input
output = gr.outputs.HTML( label="Question and Answers")  # rendered Q&A HTML
radiobutton = gr.inputs.Radio(["Wordnet", "Gensim"])  # distractor-source selector
227
+
228
def generate_question(context1,radiobutton):
    """Gradio callback: download and transcribe the video at URL
    *context1*, summarize the transcript, and return an HTML string of
    generated questions, answers, WordNet distractors, and the summary
    with answers highlighted.

    *radiobutton* selects the distractor source; only "Wordnet" is
    implemented (the Gensim branch is commented out).
    """
    UrlToAudio(context1)
    # Context manager closes the transcript; the original leaked the handle.
    with open("The_audio.txt", "r") as f:
        context = f.read()
    summary_text = summarizer(context, summary_model, summary_tokenizer)
    for wrp in wrap(summary_text, 150):
        print (wrp)
    # Renamed from `np`, which shadowed the module-level numpy import.
    keyphrases = get_keywords(context, summary_text)
    print ("\n\nNoun phrases", keyphrases)
    output = ""
    for answer in keyphrases:
        ques = get_question(summary_text, answer, question_model, question_tokenizer)
        # Default to no distractors: the original left `distractors`
        # unbound (NameError) whenever "Gensim" was selected.
        distractors = []
        if radiobutton == "Wordnet":
            distractors = get_distractors_wordnet(answer)
        #else:
        #distractors = generate_distractors(answer.capitalize(),3)

        # Question/answer lines are prepended with a newline and styled bold.
        output = "\n" + output + "<b style='color:blue;'>" + ques + "</b>"
        output = "\n" + output + "<b style='color:green;'>" + "Ans: " + answer.capitalize() + "</b>"
        if len(distractors) > 0:
            for distractor in distractors[:4]:
                output = output + "<b style='color:brown;'>" + distractor + "</b>\n"
        output = output + "<br>"

    # Bold every keyphrase (and its capitalized form) inside the summary.
    summary = "Summary: " + summary_text
    for answer in keyphrases:
        summary = summary.replace(answer, "<b>" + answer + "</b>")
        summary = summary.replace(answer.capitalize(), "<b>" + answer.capitalize() + "</b>")
    output = output + "<p>" + summary + "</p>"
    return output
262
+
263
+
264
# Wire the UI: URL textbox + distractor-source radio -> HTML Q&A output.
iface = gr.Interface(
    fn=generate_question,
    inputs=[context1,radiobutton],
    title="VidQuest",
    outputs=output)
iface.launch(debug=True)  # debug=True surfaces tracebacks in the console