Spaces:
Runtime error
Runtime error
File size: 3,063 Bytes
dd1cb9c 1f0ed21 6c400a9 9a1c39c efce880 6c400a9 efce880 dd1cb9c 6c400a9 9a1c39c efce880 a954cfa 7ca6619 efce880 9a1c39c efce880 9a1c39c efce880 dd1cb9c 6c400a9 9a1c39c 15d68b8 6c400a9 9a1c39c 6c400a9 4013f70 6c400a9 9a1c39c 6c400a9 9a1c39c 6c400a9 9a1c39c dd1cb9c 1cf5a2d 9a1c39c 4013f70 dd1cb9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import os
import random
import shutil
import string
import subprocess
import tempfile

import gradio as gr
import ocrmypdf
import spaces
from pypdf import PdfReader
def random_word(length):
    """Return a random string made of ``length`` lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    # One random.choice call per character, in order.
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)
def convert_pdf(input_file, min_text_length=1000):
    """Extract text and metadata from a PDF, running OCR on image-heavy files.

    Args:
        input_file: Path to the PDF on disk.
        min_text_length: If the directly extracted text is shorter than this
            and the document embeds at least one image, OCR is attempted.

    Returns:
        A ``(text, metadata)`` tuple, where ``text`` is the page-delimited
        text and ``metadata`` is the dict built by
        ``extract_metadata_from_pdf``.
    """
    reader = PdfReader(input_file)
    metadata = extract_metadata_from_pdf(reader)
    text = extract_text_from_pdf(reader)

    # Only inspect images when the text is short enough for OCR to matter.
    needs_ocr = len(text) < min_text_length and any(
        len(page.images) > 0 for page in reader.pages
    )
    if not needs_ocr:
        return text, metadata

    # Derive the OCR output path from the extension so it can never equal
    # the input path (str.replace was a no-op for names without ".pdf").
    root, _ext = os.path.splitext(input_file)
    out_pdf_file = f"{root}_ocr.pdf"
    try:
        ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)
        # Re-extract from the OCR *output*; the input has no text layer.
        text = extract_text_from_pdf(PdfReader(out_pdf_file))
    finally:
        # Remove the OCR file even when OCR or extraction fails.
        if os.path.exists(out_pdf_file):
            os.remove(out_pdf_file)
    return text, metadata
def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF reader.

    Each page that yields text is emitted as a ``---- Page N ----`` header
    (``N`` is the zero-based page index) followed by the page text and a
    blank line. Pages with no text are skipped.

    Args:
        reader: Object exposing ``pages``, an iterable of page objects with
            an ``extract_text()`` method (e.g. ``pypdf.PdfReader``).

    Returns:
        The combined text with leading and trailing whitespace stripped, or
        an empty string when no page has text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page; extraction is the expensive step.
        text = page.extract_text()
        if text:  # skips both "" and None
            sections.append(f"---- Page {idx} ----\n{text}\n\n")
    return "".join(sections).strip()
def extract_metadata_from_pdf(reader):
    """Return the basic document-information fields of a PDF as a dict.

    Args:
        reader: Object exposing a ``metadata`` attribute (e.g.
            ``pypdf.PdfReader``). ``metadata`` may be ``None`` when the PDF
            has no document information.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject`` and ``title``. Every value is ``None`` when the document
        has no metadata.
    """
    fields = ("author", "creator", "producer", "subject", "title")
    # Read the attribute once instead of once per field.
    info = reader.metadata
    if info is None:
        return dict.fromkeys(fields)
    return {field: getattr(info, field) for field in fields}
def convert_pandoc(input_file, filename):
    """Convert a document to markdown by running pandoc on a temporary copy.

    The upload is copied under its original name so that pandoc sees the
    file extension. All work happens inside a private temporary directory
    that is removed afterwards, whether or not the conversion succeeds.

    Args:
        input_file: Path to the uploaded file on disk.
        filename: Original (user-supplied) file name; only its final path
            component is used.

    Returns:
        The markdown produced by pandoc, as a string.

    Raises:
        ValueError: If pandoc exits with a non-zero status.
    """
    # filename is untrusted: keep only the last path component so it cannot
    # escape the temporary directory or overwrite application files.
    safe_name = os.path.basename(filename)
    if safe_name in ("", ".", ".."):
        safe_name = "input"

    with tempfile.TemporaryDirectory() as work_dir:
        source_copy = os.path.join(work_dir, safe_name)
        # Suffixing the input path guarantees input and output differ.
        output_file = source_copy + ".out.md"
        shutil.copyfile(input_file, source_copy)

        result = subprocess.call(
            ["pandoc", source_copy, "-t", "markdown", "-o", output_file]
        )
        if result != 0:
            raise ValueError("Error converting file to markdown with pandoc")

        # Read pandoc's output as UTF-8 rather than the locale default.
        with open(output_file, "r", encoding="utf-8") as f:
            return f.read()
@spaces.GPU
def convert(input_file, filename):
    """Convert an uploaded file to text/markdown based on its extension.

    Args:
        input_file: Path to the uploaded file on disk.
        filename: Original file name; its extension selects the pipeline.
            If empty, the base name of ``input_file`` is used instead.

    Returns:
        A ``(text, metadata)`` tuple. ``metadata`` is an empty dict for
        everything except PDFs.
    """
    plain_text_filetypes = (
        ".txt",
        ".csv",
        ".tsv",
        ".md",
        ".yaml",
        ".toml",
        ".json",
        ".json5",
        ".jsonc",
    )

    # Fall back to the upload's own name when no filename was supplied.
    name = (filename or "").strip() or os.path.basename(input_file)
    # Match extensions case-insensitively so "REPORT.PDF" is handled as a PDF.
    lowered = name.lower()

    # Already a plain text file that wouldn't benefit from pandoc so return the content
    if lowered.endswith(plain_text_filetypes):
        # Decode as UTF-8; undecodable bytes are replaced rather than
        # aborting the request.
        with open(input_file, "r", encoding="utf-8", errors="replace") as f:
            return f.read(), {}

    if lowered.endswith(".pdf"):
        return convert_pdf(input_file)

    return convert_pandoc(input_file, name), {}
# The original filename is requested as a separate text input: the gradio JS
# interface strips it from the upload, and it is what selects the correct
# processing pipeline.
demo = gr.Interface(
    fn=convert,
    inputs=[
        gr.File(label="Upload File", type="filepath"),
        gr.Text(label="Filename"),
    ],
    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
)
demo.launch()
|