kahennefer committed on
Commit
61ccf5a
·
1 Parent(s): 803416f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -56
app.py CHANGED
@@ -2,88 +2,81 @@ import gradio as gr
2
  from transformers import pipeline
3
  from tempfile import NamedTemporaryFile
4
  from PyPDF2 import PdfReader
5
- from IPython.display import Audio
6
- import numpy as np
7
  from bark import SAMPLE_RATE, generate_audio, preload_models
8
  from scipy.io.wavfile import write as write_wav
9
  import torch
10
 
11
- def summarize_abstract_from_pdf(pdf_file_path):
12
- # Initialize the summarization pipeline
13
 
 
 
14
  abstract_string = 'abstract'
 
15
  found_abstract = False
16
- intro_string ='introduction'
17
- extracted_text_string =""
18
 
19
  # Read the PDF and extract text from the first page
20
  with open(pdf_file_path, 'rb') as pdf_file:
21
  reader = PdfReader(pdf_file)
22
- text = ""
23
- text += reader.pages[0].extract_text()
24
-
25
 
26
  file = text.splitlines()
27
  for lines in file:
28
- lower_lines = lines.lower()
29
- if lower_lines.strip()== abstract_string:
30
- found_abstract = True
31
- elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
32
- found_abstract = False
33
-
34
- if found_abstract == True:
35
- extracted_text_string += lines
36
 
 
 
37
 
38
  extracted_text_string = extracted_text_string.replace("Abstract", "")
39
- summarizer = pipeline("summarization", "pszemraj/led-base-book-summary",device=0 if torch.cuda.is_available() else -1,)
40
  # Generate a summarized abstract using the specified model
41
- summarized_abstract = summarizer(extracted_text_string,
42
- min_length=16,
43
- max_length=150,
44
- no_repeat_ngram_size=3,
45
- encoder_no_repeat_ngram_size=3,
46
- repetition_penalty=3.5,
47
- num_beams=4,
48
- early_stopping=True,
49
- )
50
- #I run this twice to get summazired text
51
- summarized_abstract2 = summarizer(summarized_abstract[0]['summary_text'],
52
- min_length=16,
53
- max_length=25,
54
- no_repeat_ngram_size=3,
55
- encoder_no_repeat_ngram_size=3,
56
- repetition_penalty=3.5,
57
- num_beams=4,
58
- early_stopping=True,
59
  )
60
 
61
-
 
 
 
 
 
 
 
 
 
 
62
 
63
  # Return the summarized abstract as a string
64
  return summarized_abstract2[0]['summary_text']
65
 
 
66
  def generate_audio_func(pdf_file):
67
-
68
-
69
- model_name = "suno/bark-small"
70
- # Download and load the specified model
71
- preload_models(model_name)
72
  # Access the input file path
73
- pdf_file_path = pdf_file.name
74
-
75
- # Generate audio from text
76
- #call the summarize abstract function
77
- text_prompt = summarize_abstract_from_pdf(pdf_file_path)
78
- audio_array = generate_audio(text_prompt)
79
- # Create a temporary WAV file to save the audio
80
- with NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
81
- wav_file_path = temp_wav_file.name
82
- write_wav(wav_file_path, 22050, (audio_array * 32767).astype(np.int16))
83
-
84
- return wav_file_path
85
-
86
 
 
87
 
88
  # Define app name, app description, and examples
89
  app_name = "PDF to Audio Converter"
@@ -101,4 +94,4 @@ demo = gr.Interface(
101
  description=app_description
102
  )
103
 
104
- demo.launch()
 
2
  from transformers import pipeline
3
  from tempfile import NamedTemporaryFile
4
  from PyPDF2 import PdfReader
 
 
5
  from bark import SAMPLE_RATE, generate_audio, preload_models
6
  from scipy.io.wavfile import write as write_wav
7
  import torch
8
 
9
+ # Initialize the summarization pipeline
10
+ summarizer = pipeline("summarization", "pszemraj/led-base-book-summary", device=0 if torch.cuda.is_available() else -1)
11
 
12
+ # Function to summarize abstract from PDF
13
+ def summarize_abstract_from_pdf(pdf_file_path):
14
  abstract_string = 'abstract'
15
+ intro_string = 'introduction'
16
  found_abstract = False
17
+ extracted_text_string = ""
 
18
 
19
  # Read the PDF and extract text from the first page
20
  with open(pdf_file_path, 'rb') as pdf_file:
21
  reader = PdfReader(pdf_file)
22
+ text = reader.pages[0].extract_text()
 
 
23
 
24
  file = text.splitlines()
25
  for lines in file:
26
+ lower_lines = lines.lower()
27
+ if lower_lines.strip() == abstract_string:
28
+ found_abstract = True
29
+ elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
30
+ found_abstract = False
 
 
 
31
 
32
+ if found_abstract:
33
+ extracted_text_string += lines
34
 
35
  extracted_text_string = extracted_text_string.replace("Abstract", "")
36
+
37
  # Generate a summarized abstract using the specified model
38
+ summarized_abstract = summarizer(
39
+ extracted_text_string,
40
+ min_length=16,
41
+ max_length=150,
42
+ no_repeat_ngram_size=3,
43
+ encoder_no_repeat_ngram_size=3,
44
+ repetition_penalty=3.5,
45
+ num_beams=4,
46
+ early_stopping=True,
 
 
 
 
 
 
 
 
 
47
  )
48
 
49
+ # Run summarization twice to get the summarized text
50
+ summarized_abstract2 = summarizer(
51
+ summarized_abstract[0]['summary_text'],
52
+ min_length=16,
53
+ max_length=25,
54
+ no_repeat_ngram_size=3,
55
+ encoder_no_repeat_ngram_size=3,
56
+ repetition_penalty=3.5,
57
+ num_beams=4,
58
+ early_stopping=True,
59
+ )
60
 
61
  # Return the summarized abstract as a string
62
  return summarized_abstract2[0]['summary_text']
63
 
64
+ # Function to generate audio from PDF
65
  def generate_audio_func(pdf_file):
66
+ model_name = "suno/bark-small"
67
+
68
+ # Download and load the specified model
69
+ preload_models(model_name)
70
+
71
  # Access the input file path
72
+ pdf_file_path = pdf_file.name
73
+
74
+ # Generate audio from text
75
+ # Call the summarize abstract function
76
+ text_prompt = summarize_abstract_from_pdf(pdf_file_path)
77
+ audio_array = generate_audio(text_prompt)
 
 
 
 
 
 
 
78
 
79
+ return audio_array # Return the audio data as a NumPy array
80
 
81
  # Define app name, app description, and examples
82
  app_name = "PDF to Audio Converter"
 
94
  description=app_description
95
  )
96
 
97
+ demo.launch()