storresbusquets committed · commit 8cae8bc · 1 parent: c6818ef
Update app.py

app.py CHANGED
@@ -44,18 +44,18 @@ class GradioInference:
         self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
         self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
 
-        self.llm_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
+        # self.llm_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
 
-        self.pipeline = pipeline(
-            "text-generation", #task
-            model="tiiuae/falcon-7b-instruct",
-            tokenizer=self.llm_tokenizer,
-            trust_remote_code=True,
-            do_sample=True,
-            top_k=10,
-            num_return_sequences=1,
-            eos_token_id=self.tokenizer.eos_token_id
-        )
+        # self.pipeline = pipeline(
+        #     "text-generation", #task
+        #     model="tiiuae/falcon-7b-instruct",
+        #     tokenizer=self.llm_tokenizer,
+        #     trust_remote_code=True,
+        #     do_sample=True,
+        #     top_k=10,
+        #     num_return_sequences=1,
+        #     eos_token_id=self.tokenizer.eos_token_id
+        # )
 
 
     def __call__(self, link, lang, size, progress=gr.Progress()):
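For context, the block commented out here had built a second, Falcon-based generation pipeline next to the mT5 summarizer. A minimal standalone sketch of that setup, with the model names and arguments taken from the diff (the surrounding GradioInference class is omitted):

from transformers import AutoTokenizer, pipeline

# Falcon-7B-Instruct generation pipeline that this commit disables.
llm_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

falcon_pipeline = pipeline(
    "text-generation",                  # task
    model="tiiuae/falcon-7b-instruct",
    tokenizer=llm_tokenizer,
    trust_remote_code=True,             # Falcon ships custom modeling code
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    # The diff passed self.tokenizer.eos_token_id, i.e. the EOS id of the
    # mT5 tokenizer, which looks unintended; the Falcon tokenizer's own
    # EOS id is used in this sketch instead.
    eos_token_id=llm_tokenizer.eos_token_id,
)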
@@ -214,53 +214,53 @@ class GradioInference:
         progress(0.40, desc="Summarizing")
 
         # Perform summarization on the transcription
-        # transcription_summary = self.summarizer(
-        #     results["text"], max_length=150, min_length=30, do_sample=False
-        # )
+        transcription_summary = self.summarizer(
+            results["text"], max_length=150, min_length=30, do_sample=False
+        )
 
         ########################## PRUEBA LLM #################################
-        from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
+        # from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
 
-        llm = HuggingFacePipeline(pipeline = self.pipeline, model_kwargs = {'temperature':0})
+        # llm = HuggingFacePipeline(pipeline = self.pipeline, model_kwargs = {'temperature':0})
 
-        template = """
-                   Write a concise summary of the following text delimited by triple backquotes.
-                   ```{text}```
-                   CONCISE SUMMARY:
-                   """
+        # template = """
+        #            Write a concise summary of the following text delimited by triple backquotes.
+        #            ```{text}```
+        #            CONCISE SUMMARY:
+        # """
 
-        prompt = PromptTemplate(template=template, input_variables=["text"])
+        # prompt = PromptTemplate(template=template, input_variables=["text"])
 
-        llm_chain = LLMChain(prompt=prompt, llm=llm)
+        # llm_chain = LLMChain(prompt=prompt, llm=llm)
 
-        text = results["text"]
+        # text = results["text"]
 
-        summ = llm_chain.run(text)
+        # summ = llm_chain.run(text)
         ########################## FIN PRUEBA LLM #################################
 
         #### Prueba
-        # WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
+        WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
 
-        # input_ids_sum = self.tokenizer(
-        #     [WHITESPACE_HANDLER(results["text"])],
-        #     return_tensors="pt",
-        #     padding="max_length",
-        #     truncation=True,
-        #     max_length=512
-        # )["input_ids"]
+        input_ids_sum = self.tokenizer(
+            [WHITESPACE_HANDLER(results["text"])],
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )["input_ids"]
 
-        # output_ids_sum = self.model.generate(
-        #     input_ids=input_ids_sum,
-        #     max_length=130,
-        #     no_repeat_ngram_size=2,
-        #     num_beams=4
-        # )[0]
+        output_ids_sum = self.model.generate(
+            input_ids=input_ids_sum,
+            max_length=130,
+            no_repeat_ngram_size=2,
+            num_beams=4
+        )[0]
 
-        # summary = self.tokenizer.decode(
-        #     output_ids_sum,
-        #     skip_special_tokens=True,
-        #     clean_up_tokenization_spaces=False
-        # )
+        summary = self.tokenizer.decode(
+            output_ids_sum,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
         #### Fin prueba
 
         progress(0.50, desc="Extracting Keywords")
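This hunk swaps which summarizer is live: the LangChain-plus-Falcon chain in the PRUEBA LLM block ("prueba" is Spanish for "test") is commented out, and the mT5_multilingual_XLSum path is enabled, both via the self.summarizer call and via direct tokenize/generate/decode. A self-contained sketch of the enabled mT5 path, using the same calls and parameters as the diff (raw strings added to the regex patterns; text stands in for results["text"]):

import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

# Collapse newlines and whitespace runs before tokenizing.
WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))

text = "..."  # the transcription produced earlier in __call__

# Tokenize into a fixed 512-token window.
input_ids = tokenizer(
    [WHITESPACE_HANDLER(text)],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512,
)["input_ids"]

# Beam-search a summary of at most 130 tokens.
output_ids = model.generate(
    input_ids=input_ids,
    max_length=130,
    no_repeat_ngram_size=2,
    num_beams=4,
)[0]

summary = tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(summary)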
@@ -303,8 +303,8 @@ class GradioInference:
         if lang == "english":
             return (
                 results["text"],
-                summ,
-                # transcription_summary[0]["summary_text"],
+                # summ,
+                transcription_summary[0]["summary_text"],
                 formatted_keywords,
                 formatted_sentiment,
                 wordcloud_image,
@@ -312,8 +312,8 @@ class GradioInference:
         else:
             return (
                 results["text"],
-                summ,
-                # summary,
+                # summ,
+                summary,
                 formatted_keywords,
                 formatted_sentiment,
                 wordcloud_image,
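The return tuples change to match: the English branch now returns transcription_summary[0]["summary_text"] and the other branch returns summary from the mT5 code above, with the old summ values left commented out. The [0]["summary_text"] indexing matches the output shape of a transformers summarization pipeline, which self.summarizer presumably is (its construction is not shown in this diff); a small sketch under that assumption:

from transformers import pipeline

# A summarization pipeline returns a list with one dict per input,
# hence the [0]["summary_text"] indexing. The model name below is
# illustrative; the diff does not show which model self.summarizer wraps.
summarizer = pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum")

out = summarizer(
    "Some long transcription text ...",
    max_length=150,
    min_length=30,
    do_sample=False,
)
print(out[0]["summary_text"])  # out looks like [{'summary_text': '...'}]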