HAOUARI Noureddine committed on
Commit
119e740
1 Parent(s): 922a6fe
Files changed (1) hide show
  1. app.py +43 -0
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PyPDF2 import PdfReader
3
+ from concurrent.futures import ThreadPoolExecutor
4
+
5
+
6
def convert_pdf_to_text(pdf_file):
    """Extract the text of one PDF and wrap it in a delimited section.

    Args:
        pdf_file: Object exposing a ``name`` attribute that holds the path of
            the PDF on disk (presumably the uploaded-file wrapper passed in by
            the UI callback -- confirm against the Gradio version in use).

    Returns:
        A string made of a ``---`` separator line, a ``file name:`` header
        with the path, a ``content:`` line, the concatenated text of every
        page, and a closing ``---`` separator line.

    Raises:
        ValueError: If the path does not end with ``.pdf`` (any letter case).
    """
    path = pdf_file.name
    print(path)

    # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted too.
    if not path.lower().endswith(".pdf"):
        raise ValueError("Invalid file format. Please upload PDF files only.")

    with open(path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Pages are concatenated with no separator. A page that yields no
        # text is treated as empty so the join never sees a non-string.
        body = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    return f"\n---\nfile name: {path}\n content: \n{body}\n---\n"
19
+
20
+
21
def pdf_to_text(pdf_files):
    """Convert every uploaded PDF to text and join the results.

    Args:
        pdf_files: Collection of uploaded file objects, each one accepted by
            ``convert_pdf_to_text``. ``None`` or an empty collection means
            nothing was uploaded (presumably what the UI passes when the form
            is submitted without files -- confirm).

    Returns:
        The per-file text sections joined with a newline, in the same order
        as ``pdf_files``; an empty string when there is nothing to convert.
    """
    # Mapping over None would raise TypeError inside the executor, so bail
    # out early when there is no input at all.
    if not pdf_files:
        return ""

    # Convert the files concurrently on a thread pool.
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order, so the output order
        # follows the upload order regardless of which file finishes first.
        return "\n".join(executor.map(convert_pdf_to_text, pdf_files))
31
+
32
+
33
# Web UI definition: a multi-file upload widget feeds pdf_to_text, and the
# returned string is shown in a plain text output.
iface = gr.Interface(
    title="PDF to Text Converter",
    description="Upload PDF files and get their content in text format.",
    fn=pdf_to_text,
    inputs=gr.inputs.File(
        file_count="multiple",
        label="Upload a PDF file",
        type="file",
    ),
    outputs="text",
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()