storresbusquets committed
Commit 8cae8bc · 1 Parent(s): c6818ef

Update app.py

Files changed (1): app.py (+48 −48)

app.py CHANGED
@@ -44,18 +44,18 @@ class GradioInference:
         self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
         self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
 
-        self.llm_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
+        # self.llm_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
 
-        self.pipeline = pipeline(
-            "text-generation", #task
-            model="tiiuae/falcon-7b-instruct",
-            tokenizer=self.llm_tokenizer,
-            trust_remote_code=True,
-            do_sample=True,
-            top_k=10,
-            num_return_sequences=1,
-            eos_token_id=self.tokenizer.eos_token_id
-        )
+        # self.pipeline = pipeline(
+        #     "text-generation", #task
+        #     model="tiiuae/falcon-7b-instruct",
+        #     tokenizer=self.llm_tokenizer,
+        #     trust_remote_code=True,
+        #     do_sample=True,
+        #     top_k=10,
+        #     num_return_sequences=1,
+        #     eos_token_id=self.tokenizer.eos_token_id
+        # )
 
 
     def __call__(self, link, lang, size, progress=gr.Progress()):
@@ -214,53 +214,53 @@ class GradioInference:
         progress(0.40, desc="Summarizing")
 
         # Perform summarization on the transcription
-        # transcription_summary = self.summarizer(
-        #     results["text"], max_length=150, min_length=30, do_sample=False
-        # )
+        transcription_summary = self.summarizer(
+            results["text"], max_length=150, min_length=30, do_sample=False
+        )
 
         ########################## LLM TEST #################################
-        from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
+        # from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
 
-        llm = HuggingFacePipeline(pipeline = self.pipeline, model_kwargs = {'temperature':0})
+        # llm = HuggingFacePipeline(pipeline = self.pipeline, model_kwargs = {'temperature':0})
 
-        template = """
-        Write a concise summary of the following text delimited by triple backquotes.
-        ```{text}```
-        CONCISE SUMMARY:
-        """
+        # template = """
+        # Write a concise summary of the following text delimited by triple backquotes.
+        # ```{text}```
+        # CONCISE SUMMARY:
+        # """
 
-        prompt = PromptTemplate(template=template, input_variables=["text"])
+        # prompt = PromptTemplate(template=template, input_variables=["text"])
 
-        llm_chain = LLMChain(prompt=prompt, llm=llm)
+        # llm_chain = LLMChain(prompt=prompt, llm=llm)
 
-        text = results["text"]
+        # text = results["text"]
 
-        summ = llm_chain.run(text)
+        # summ = llm_chain.run(text)
         ########################## END LLM TEST #################################
 
         #### Test
-        # WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
+        WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
 
-        # input_ids_sum = self.tokenizer(
-        #     [WHITESPACE_HANDLER(results["text"])],
-        #     return_tensors="pt",
-        #     padding="max_length",
-        #     truncation=True,
-        #     max_length=512
-        # )["input_ids"]
+        input_ids_sum = self.tokenizer(
+            [WHITESPACE_HANDLER(results["text"])],
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )["input_ids"]
 
-        # output_ids_sum = self.model.generate(
-        #     input_ids=input_ids_sum,
-        #     max_length=130,
-        #     no_repeat_ngram_size=2,
-        #     num_beams=4
-        # )[0]
+        output_ids_sum = self.model.generate(
+            input_ids=input_ids_sum,
+            max_length=130,
+            no_repeat_ngram_size=2,
+            num_beams=4
+        )[0]
 
-        # summary = self.tokenizer.decode(
-        #     output_ids_sum,
-        #     skip_special_tokens=True,
-        #     clean_up_tokenization_spaces=False
-        # )
+        summary = self.tokenizer.decode(
+            output_ids_sum,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
         #### End test
 
         progress(0.50, desc="Extracting Keywords")
@@ -303,8 +303,8 @@ class GradioInference:
         if lang == "english":
             return (
                 results["text"],
-                summ,
-                # transcription_summary[0]["summary_text"],
+                # summ,
+                transcription_summary[0]["summary_text"],
                 formatted_keywords,
                 formatted_sentiment,
                 wordcloud_image,
@@ -312,8 +312,8 @@
         else:
            return (
                 results["text"],
-                summ,
-                # summary,
+                # summ,
+                summary,
                 formatted_keywords,
                 formatted_sentiment,
                 wordcloud_image,
 
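For reference, a minimal standalone sketch of the summarization path this commit switches back to. The English branch returns transcription_summary[0]["summary_text"] from self.summarizer, a summarization pipeline constructed elsewhere in app.py (not shown in this diff); the non-English branch runs csebuetnlp/mT5_multilingual_XLSum directly, as below. The tokenizer arguments and generation settings come from the diff; the summarize() wrapper, raw-string regex patterns, and imports are editorial:

import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Collapse newlines and runs of whitespace before tokenizing, as in the diff
# (raw strings added here to avoid invalid-escape warnings).
WHITESPACE_HANDLER = lambda k: re.sub(r"\s+", " ", re.sub(r"\n+", " ", k.strip()))

tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

def summarize(text: str) -> str:
    # Clean the transcription, then pad/truncate it to 512 tokens.
    input_ids = tokenizer(
        [WHITESPACE_HANDLER(text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]

    # Beam search with an n-gram repetition penalty, as configured in the diff.
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=130,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    return tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

Note that padding="max_length" pads every input out to the full 512 tokens, and truncation=True means a long transcription is summarized from its first 512 tokens only.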
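And the path the commit disables: Falcon-7B-Instruct driven through LangChain. The sketch below assembles the commented-out pieces from both hunks into one place, assuming a LangChain version contemporary with this commit, in which HuggingFacePipeline, PromptTemplate, and LLMChain were importable from the top-level langchain package and LLMChain.run() was the current API. One apparent bug is worth noting: the original passed eos_token_id=self.tokenizer.eos_token_id, i.e. the mT5 tokenizer's EOS id, into the Falcon pipeline; the sketch uses the Falcon tokenizer's own EOS id, which is presumably what was intended.

from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain

llm_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

# Text-generation pipeline with the sampling settings from the diff.
falcon_pipeline = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",
    tokenizer=llm_tokenizer,
    trust_remote_code=True,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    # The diff used the mT5 tokenizer's EOS id here; the Falcon tokenizer's
    # own EOS id is presumably what was intended.
    eos_token_id=llm_tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=falcon_pipeline, model_kwargs={"temperature": 0})

template = """
Write a concise summary of the following text delimited by triple backquotes.
```{text}```
CONCISE SUMMARY:
"""

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

transcription = "..."  # the Whisper transcription; results["text"] in app.py
summ = llm_chain.run(transcription)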