ketanmore's picture
Upload folder using huggingface_hub
2720487 verified
raw
history blame
2.27 kB
import PIL
from surya.input.processing import open_pdf, get_page_images
import os
import filetype
from PIL import Image
import json
def get_name_from_path(path):
    """Return the file name of *path* truncated at its first dot.

    Everything from the first ``.`` in the base name onwards is dropped, so
    ``"dir/scan.v2.pdf"`` yields ``"scan"`` and a dot-file such as
    ``".hidden"`` yields an empty string.
    """
    file_name = os.path.basename(path)
    stem, _dot, _rest = file_name.partition(".")
    return stem
def load_pdf(pdf_path, max_pages=None, start_page=None):
    """Render a contiguous range of pages from a PDF.

    Args:
        pdf_path: Path of the PDF; handed to ``open_pdf``.
        max_pages: Optional cap on how many pages to render, counted from
            ``start_page``. ``None`` or ``0`` means "no cap".
        start_page: Optional index of the first page to render (page indices
            start at 0). ``None`` or ``0`` starts at the first page.

    Returns:
        A ``(images, names)`` tuple. ``images`` is whatever
        ``get_page_images`` returns for the selected page indices; ``names``
        repeats the PDF's base name once per selected page.

    Raises:
        AssertionError: If ``start_page`` is outside the document or
            ``max_pages`` is negative.
    """
    doc = open_pdf(pdf_path)
    # Release the document even when validation or rendering fails; the
    # previous straight-line close() was skipped on any exception.
    try:
        last_page = len(doc)

        if start_page:
            assert start_page < last_page and start_page >= 0, f"Start page must be between 0 and {last_page}"
        else:
            start_page = 0

        if max_pages:
            assert max_pages >= 0, "Max pages must be greater than 0"
            last_page = min(start_page + max_pages, last_page)

        page_indices = list(range(start_page, last_page))
        images = get_page_images(doc, page_indices)
    finally:
        doc.close()

    # Every page of the same PDF shares one name, so compute it once.
    pdf_name = get_name_from_path(pdf_path)
    names = [pdf_name] * len(page_indices)
    return images, names
def load_image(image_path):
    """Load a single image file as an RGB image.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A ``(images, names)`` tuple of two one-element lists: the image
        converted to RGB mode, and the name derived from ``image_path`` via
        ``get_name_from_path``. The list shape mirrors ``load_pdf`` so callers
        can treat both loaders alike.

    Raises:
        PIL.UnidentifiedImageError: If Pillow cannot identify the file as an
            image.
    """
    # Image.open() is lazy and keeps the underlying file open. convert()
    # produces an independent in-memory image, so the source can be closed
    # as soon as the conversion is done.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    name = get_name_from_path(image_path)
    return [image], [name]
def load_from_file(input_path, max_pages=None, start_page=None):
    """Load one input file, dispatching on its detected type.

    Args:
        input_path: Path of a PDF or image file.
        max_pages: Forwarded to ``load_pdf`` when the file is a PDF.
        start_page: Forwarded to ``load_pdf`` when the file is a PDF.

    Returns:
        The ``(images, names)`` tuple produced by ``load_pdf`` for PDFs, or by
        ``load_image`` for everything else.
    """
    input_type = filetype.guess(input_path)
    # filetype.guess() returns None for unrecognised content; treat that as
    # "not a PDF" (same guard as load_from_folder) instead of failing with an
    # AttributeError on ``None.extension``.
    if input_type and input_type.extension == "pdf":
        return load_pdf(input_path, max_pages, start_page)
    return load_image(input_path)
def load_from_folder(folder_path, max_pages=None, start_page=None):
    """Load every PDF and image found directly inside a folder.

    Entries whose name starts with ``.`` and sub-directories are skipped.
    Files detected as PDFs go through ``load_pdf`` (with ``max_pages`` and
    ``start_page``); all other files go through ``load_image``. Files Pillow
    cannot identify are reported on stdout and skipped.

    Returns:
        A ``(images, names)`` tuple of parallel lists accumulated over all
        files, in the order ``os.listdir`` yields them.
    """
    # Collect the candidate files first: no dot-entries, no directories.
    candidates = []
    for entry in os.listdir(folder_path):
        if entry.startswith("."):
            continue
        full_path = os.path.join(folder_path, entry)
        if os.path.isdir(full_path):
            continue
        candidates.append(full_path)

    images, names = [], []
    for file_path in candidates:
        kind = filetype.guess(file_path)
        if kind and kind.extension == "pdf":
            loaded_images, loaded_names = load_pdf(file_path, max_pages, start_page)
        else:
            try:
                loaded_images, loaded_names = load_image(file_path)
            except PIL.UnidentifiedImageError:
                print(f"Could not load image {file_path}")
                continue
        images.extend(loaded_images)
        names.extend(loaded_names)
    return images, names
def load_lang_file(lang_path, names):
    """Look up the language entry for each name in a JSON mapping file.

    Args:
        lang_path: Path of a JSON file containing an object keyed by name.
        names: Iterable of names to look up, in the desired output order.

    Returns:
        A list with one entry per element of ``names``. Each entry is a
        shallow copy of the stored value, so callers can mutate it without
        affecting entries returned for repeated names.

    Raises:
        KeyError: If a name is missing from the JSON mapping.
    """
    # JSON text is UTF-8; relying on the platform default encoding corrupts
    # non-ASCII keys on systems whose locale is not UTF-8.
    with open(lang_path, "r", encoding="utf-8") as f:
        lang_dict = json.load(f)
    return [lang_dict[name].copy() for name in names]