kahennefer committed on
Commit
8efae88
1 Parent(s): 61ccf5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -46
app.py CHANGED
@@ -1,82 +1,87 @@
1
- import gradio as gr
2
  from transformers import pipeline
3
  from tempfile import NamedTemporaryFile
4
  from PyPDF2 import PdfReader
 
 
5
  from bark import SAMPLE_RATE, generate_audio, preload_models
6
  from scipy.io.wavfile import write as write_wav
7
  import torch
8
 
9
# Summarization pipeline, built once at import time and shared by every call.
# It runs on the first CUDA device when one is available, otherwise on the
# CPU (device=-1).
summarizer = pipeline(
    "summarization",
    "pszemraj/led-base-book-summary",
    device=0 if torch.cuda.is_available() else -1,
)


def _extract_abstract(page_text):
    """Return the abstract section of *page_text* as one space-joined string.

    Collection starts after a line that is exactly ``abstract`` (ignoring
    case and surrounding whitespace) and stops at a line that contains both
    ``1`` and ``introduction``.  Neither marker line is included.  An empty
    string is returned when no abstract heading is present.
    """
    collected = []
    in_abstract = False
    for raw_line in page_text.splitlines():
        stripped = raw_line.strip()
        normalized = stripped.lower()
        if normalized == "abstract":
            # Drop the heading itself instead of appending it and deleting
            # every "Abstract" afterwards, which also removed the word from
            # the body text and missed headings such as "ABSTRACT".
            in_abstract = True
            continue
        if "1" in normalized and "introduction" in normalized:
            in_abstract = False
            continue
        if in_abstract and stripped:
            collected.append(stripped)
    # splitlines() removes the line breaks, so the lines must be re-joined
    # with a space; plain concatenation fuses the last word of one line with
    # the first word of the next.
    return " ".join(collected)


def summarize_abstract_from_pdf(pdf_file_path):
    """Summarize the abstract found on the first page of a PDF.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The ``summary_text`` string produced by the second summarization
        pass.

    Raises:
        ValueError: If no abstract text is found on the first page.
    """
    # Read the PDF and extract text from the first page.
    with open(pdf_file_path, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        text = reader.pages[0].extract_text()

    abstract_text = _extract_abstract(text)
    if not abstract_text:
        # Summarizing an empty prompt would yield text unrelated to the PDF.
        raise ValueError("No abstract found on the first page of the PDF.")

    # Generation settings shared by both passes; only max_length differs.
    generation_options = dict(
        min_length=16,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        early_stopping=True,
    )

    # First pass: condense the abstract to at most 150 tokens.
    first_pass = summarizer(abstract_text, max_length=150, **generation_options)

    # Second pass: summarize the summary again, capped at 25 tokens.
    second_pass = summarizer(
        first_pass[0]['summary_text'],
        max_length=25,
        **generation_options,
    )

    # Return the summarized abstract as a string.
    return second_pass[0]['summary_text']
63
 
64
- # Function to generate audio from PDF
65
  def generate_audio_func(pdf_file):
66
  model_name = "suno/bark-small"
67
-
68
- # Download and load the specified model
69
  preload_models(model_name)
70
-
71
  # Access the input file path
72
  pdf_file_path = pdf_file.name
73
-
74
- # Generate audio from text
75
- # Call the summarize abstract function
76
- text_prompt = summarize_abstract_from_pdf(pdf_file_path)
77
  audio_array = generate_audio(text_prompt)
 
 
 
 
 
 
 
78
 
79
- return audio_array # Return the audio data as a NumPy array
80
 
81
  # Define app name, app description, and examples
82
  app_name = "PDF to Audio Converter"
@@ -94,4 +99,4 @@ demo = gr.Interface(
94
  description=app_description
95
  )
96
 
97
- demo.launch()
 
 
1
  from transformers import pipeline
2
  from tempfile import NamedTemporaryFile
3
  from PyPDF2 import PdfReader
4
+ from IPython.display import Audio
5
+ import numpy as np
6
  from bark import SAMPLE_RATE, generate_audio, preload_models
7
  from scipy.io.wavfile import write as write_wav
8
  import torch
9
 
 
 
 
 
10
  def summarize_abstract_from_pdf(pdf_file_path):
11
+
12
+ # Initialize the summarization pipeline
13
+
14
  abstract_string = 'abstract'
 
15
  found_abstract = False
16
+ intro_string ='introduction'
17
+ extracted_text_string =""
18
 
19
  # Read the PDF and extract text from the first page
20
  with open(pdf_file_path, 'rb') as pdf_file:
21
  reader = PdfReader(pdf_file)
22
+ text = ""
23
+ text += reader.pages[0].extract_text()
24
+
25
 
26
  file = text.splitlines()
27
  for lines in file:
28
+ lower_lines = lines.lower()
29
+ if lower_lines.strip()== abstract_string:
30
+ found_abstract = True
31
+ elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
32
+ found_abstract = False
33
 
34
+ if found_abstract == True:
35
+ extracted_text_string += lines
36
 
 
37
 
38
+ extracted_text_string = extracted_text_string.replace("Abstract", "")
39
+ summarizer = pipeline("summarization", "pszemraj/led-base-book-summary",device=0 if torch.cuda.is_available() else -1,)
40
  # Generate a summarized abstract using the specified model
41
+ summarized_abstract = summarizer(extracted_text_string,
42
+ min_length=16,
43
+ max_length=150,
44
+ no_repeat_ngram_size=3,
45
+ encoder_no_repeat_ngram_size=3,
46
+ repetition_penalty=3.5,
47
+ num_beams=4,
48
+ early_stopping=True,
 
49
  )
50
+ #I run this twice to get summazired text
51
+ summarized_abstract2 = summarizer(summarized_abstract[0]['summary_text'],
52
+ min_length=16,
53
+ max_length=25,
54
+ no_repeat_ngram_size=3,
55
+ encoder_no_repeat_ngram_size=3,
56
+ repetition_penalty=3.5,
57
+ num_beams=4,
58
+ early_stopping=True,
 
 
59
  )
60
 
61
+
62
+
63
  # Return the summarized abstract as a string
64
  return summarized_abstract2[0]['summary_text']
65
 
 
66
  def generate_audio_func(pdf_file):
67
  model_name = "suno/bark-small"
68
+ # Download and load the specified model
 
69
  preload_models(model_name)
 
70
  # Access the input file path
71
  pdf_file_path = pdf_file.name
72
+
73
+ # Generate audio from text
74
+ #call the summarize abstract function
75
+ text_prompt = summarize_abstract_from_pdf(pdf_file_path)
76
  audio_array = generate_audio(text_prompt)
77
+
78
+ # Create a temporary WAV file to save the audio
79
+ with NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
80
+ wav_file_path = temp_wav_file.name
81
+ write_wav(wav_file_path, 22050, (audio_array * 32767).astype(np.int16))
82
+ return wav_file_path
83
+
84
 
 
85
 
86
  # Define app name, app description, and examples
87
  app_name = "PDF to Audio Converter"
 
99
  description=app_description
100
  )
101
 
102
+ demo.launch()