|
import gradio as gr |
|
from PyPDF2 import PdfReader |
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
from gtts import gTTS |
|
from io import BytesIO |
|
import pydub |
|
from pydub import AudioSegment |
|
import base64 |
|
import re |
|
|
|
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

# The tokenizer and the seq2seq model are loaded once, at import time, and
# reused by every call to the summarization function.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
|
|
def extract_first_sentence(text):
    """Return the first sentence of ``text``.

    A sentence boundary is a whitespace character that directly follows a
    ``.`` or ``?``, unless the characters before it look like an abbreviation:
    dotted initials such as ``e.g.`` or a capitalised two-letter title such
    as ``Dr.``.

    Args:
        text: The string to take the first sentence from.

    Returns:
        Everything before the first sentence boundary, including the closing
        punctuation. If no boundary is found (including for an empty string),
        ``text`` is returned unchanged.
    """
    # re.split always yields at least one element, so no fallback branch is
    # needed; maxsplit=1 stops scanning once the first boundary is found.
    return re.split(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text, maxsplit=1
    )[0]
|
|
|
def summarize_pdf_abstract(pdf_file):
    """Summarize a PDF's abstract as one sentence and synthesize it as speech.

    The first page whose extracted text contains "Abstract" or "Introduction"
    is run through the module-level ``tokenizer`` and ``model``. The first
    sentence of the generated summary is converted to speech with gTTS and
    written to ``summary.mp3`` in the current working directory.

    Args:
        pdf_file: The uploaded PDF, in whatever form ``PdfReader`` accepts.

    Returns:
        A ``(summary_sentence, audio_path)`` tuple, where ``audio_path`` is
        ``"summary.mp3"``.

    Raises:
        Exception: If no file was provided, no matching page was found, the
            summary came back empty, or any underlying step failed. The
            message is that of the original error, which is attached as
            ``__cause__``.
    """
    audio_path = "summary.mp3"
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")

        reader = PdfReader(pdf_file)
        abstract_text = ""
        for page in reader.pages:
            # Text extraction is the expensive step; do it once per page.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        if not abstract_text:
            # Without this guard the model would be run on an empty string.
            raise ValueError(
                "No page containing an abstract or introduction was found."
            )

        # Truncate to the tokenizer's configured maximum length instead of
        # feeding the model an over-long sequence.
        inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs)
        # Drop special marker tokens so they are neither shown nor spoken.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        summary_sentence = extract_first_sentence(summary.strip())
        if not summary_sentence:
            raise ValueError("The model produced an empty summary.")

        # gTTS already produces MP3 data, so write it straight to disk; the
        # gTTS object itself is not a file and cannot be decoded as one.
        speech = gTTS(text=summary_sentence, lang="en")
        speech.save(audio_path)

        return summary_sentence, audio_path

    except Exception as e:
        # Keep the original contract (a plain Exception carrying the message)
        # while preserving the underlying error for debugging.
        raise Exception(str(e)) from e
|
|
|
# UI components: one file upload in, summary text plus audio player out.
pdf_upload = gr.File(label="Upload PDF")
summary_box = gr.Textbox(label="Summary")
audio_player = gr.Audio()

# Wire the summarization function to the components.
interface = gr.Interface(
    fn=summarize_pdf_abstract,
    inputs=[pdf_upload],
    outputs=[summary_box, audio_player],
)

# Start the app as soon as the module runs, with sharing enabled.
interface.launch(share=True)
|
|