Spaces:
Sleeping
Sleeping
File size: 2,950 Bytes
cbeccc1 65bdf91 7f7a851 670c797 8efae88 7f7a851 accedef 803416f 7d60a60 61ccf5a b2e2224 8efae88 b2e2224 8efae88 b2e2224 8efae88 b2e2224 8efae88 b2e2224 61ccf5a 8efae88 b2e2224 8efae88 b2e2224 8efae88 61ccf5a b2e2224 8efae88 b2e2224 7f7a851 4ffeb00 8efae88 61ccf5a 8efae88 053be7e b2e2224 45bc3f2 b2e2224 7a8cb88 0f684ad 9ba855d b2e2224 94d4a00 796d80b 9ba855d b2e2224 73bdf01 6f0fe07 accedef 8efae88 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import functools
from tempfile import NamedTemporaryFile

import gradio as gr
import numpy as np
import torch
from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio
from PyPDF2 import PdfReader
from scipy.io.wavfile import write as write_wav
from transformers import pipeline
@functools.lru_cache(maxsize=1)
def _get_summarizer():
    """Build the summarization pipeline once and reuse it on later calls."""
    return pipeline(
        "summarization",
        "pszemraj/led-base-book-summary",
        device=0 if torch.cuda.is_available() else -1,
    )


def _extract_abstract(page_text: str) -> str:
    """Return the lines between an 'Abstract' heading and a '1 ... Introduction' heading."""
    abstract_heading = "abstract"
    intro_heading = "introduction"
    collected = []
    in_abstract = False
    for raw_line in page_text.splitlines():
        stripped = raw_line.strip()
        lowered = stripped.lower()
        if lowered == abstract_heading:
            in_abstract = True
            # Skip the heading itself instead of deleting the word
            # "Abstract" from the collected text afterwards, which was
            # case-sensitive and also hit occurrences inside the body.
            continue
        if "1" in lowered and intro_heading in lowered:
            in_abstract = False
        if in_abstract and stripped:
            collected.append(stripped)
    # Join with spaces so the last word of one line is not fused with the
    # first word of the next.
    return " ".join(collected)


def summarize_abstract_from_pdf(pdf_file_path: str) -> str:
    """Summarize the abstract found on the first page of a PDF.

    The first page's text is scanned for a line that is exactly "abstract"
    (case-insensitive); every following line is collected until a line
    containing both "1" and "introduction" is reached. The collected text is
    summarized twice: first to at most 150 tokens, then that summary is
    condensed again to at most 25 tokens.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The ``summary_text`` produced by the second summarization pass.

    Raises:
        ValueError: If the PDF has no pages or no abstract section is found
            on its first page.
    """
    # Read the PDF and extract text from the first page only.
    with open(pdf_file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        if len(reader.pages) == 0:
            raise ValueError("The PDF file contains no pages.")
        first_page_text = reader.pages[0].extract_text() or ""

    abstract_text = _extract_abstract(first_page_text)
    if not abstract_text:
        # Fail with a clear message instead of feeding an empty prompt to
        # the model.
        raise ValueError(
            "No abstract section found on the first page of the PDF."
        )

    summarizer = _get_summarizer()

    # Generation settings shared by both passes; only max_length differs.
    generation_kwargs = dict(
        min_length=16,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        early_stopping=True,
    )

    # First pass: summarize the extracted abstract.
    first_pass = summarizer(abstract_text, max_length=150, **generation_kwargs)
    # Second pass: condense the first summary into a much shorter text.
    second_pass = summarizer(
        first_pass[0]["summary_text"], max_length=25, **generation_kwargs
    )

    return second_pass[0]["summary_text"]
def generate_audio_func(pdf_file):
    """Turn the summarized abstract of an uploaded PDF into a WAV file.

    Args:
        pdf_file: The uploaded file. Either an object exposing the file path
            as ``.name`` or the path itself (presumably which one Gradio
            passes depends on its version/configuration -- confirm against
            the deployed Gradio release).

    Returns:
        Path of a temporary ``.wav`` file holding the generated speech. The
        file is created with ``delete=False`` and is not removed here.

    Raises:
        ValueError: If no file was supplied.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept both a file wrapper with a .name attribute and a plain path.
    pdf_file_path = getattr(pdf_file, "name", pdf_file)

    # Summarize the abstract, then synthesize speech from the summary.
    text_prompt = summarize_abstract_from_pdf(pdf_file_path)
    audio_array = generate_audio(text_prompt)

    # Reserve a temporary path and close the handle before writing, so the
    # writer can reopen the file by name.
    with NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
        wav_file_path = temp_wav_file.name

    # Clip before scaling to 16-bit PCM so out-of-range samples saturate
    # instead of wrapping around.
    pcm_samples = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

    # Use the sample rate exported by bark rather than a hard-coded 22050, so
    # the WAV header matches the rate the audio was generated at.
    write_wav(wav_file_path, SAMPLE_RATE, pcm_samples)
    return wav_file_path
# Define the app name and description shown in the UI.
app_name = "PDF to Audio Converter"
app_description = "Convert text from a PDF file to audio. Upload a PDF file. We accept only PDF files with abstracts."

# Create the Gradio app: one file upload in, one audio player out.
# File extensions passed to file_types need a leading dot (".pdf").
input_component = gr.File(file_types=[".pdf"])
output_component = gr.Audio()
demo = gr.Interface(
    fn=generate_audio_func,
    inputs=input_component,
    outputs=output_component,
    title=app_name,
    description=app_description,
)
demo.launch()