Spaces:

zliang
/

fastpaperlayout

Sleeping

File size: 2,817 Bytes

f662962

import gradio as gr
from ultralytics import YOLO
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import cv2
import io

# Load the trained YOLOv8 model once, at import time. The module-level
# `model` object is what infer_image() calls for detection and reads class
# names from.
model_path = 'best.pt'  # Replace with the path to your trained .pt file
model = YOLO(model_path)

# Function to extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Extract the embedded images of a PDF as RGB PIL images.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of ``PIL.Image.Image`` objects in RGB mode, ordered by page
        and, within a page, by the order ``page.get_images`` reports them.
    """
    images = []

    # Open the document as a context manager so it is closed even when an
    # image fails to decode part-way through the file.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first field of each image entry is its xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                # convert() returns a fully loaded copy, so the result does
                # not depend on the BytesIO buffer or the open document.
                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                images.append(image)
    return images

# Placeholder function to extract tables (modify as needed)
def extract_tables_from_pdf(pdf_path):
    """Stand-in for table extraction that only reports it is unimplemented.

    Args:
        pdf_path: Accepted so callers can already pass the PDF location;
            the current stub does not read it.

    Returns:
        A one-element list holding the "not implemented" notice.
    """
    # Stub result; swap in real table extraction logic here.
    placeholder_notice = "Table extraction not implemented"
    return [placeholder_notice]

# Function to perform inference on an image
def infer_image(image):
    """Run the YOLO model on one image and draw its detections.

    Args:
        image: A PIL image (any object exposing ``convert('RGB')``).

    Returns:
        A NumPy array holding an RGB copy of the image with a green bounding
        box and a ``"<class name> <confidence>"`` label for each detection.
    """
    pil_rgb = image.convert('RGB')

    # Pass the PIL image itself to the model. Ultralytics interprets NumPy
    # input as OpenCV-style BGR, so feeding it an RGB array would run
    # inference with the red and blue channels swapped.
    results = model(pil_rgb)

    # np.array() yields a separate writable buffer to draw on.
    annotated_image = np.array(pil_rgb)
    green = (0, 255, 0)
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
            cls = int(box.cls[0])
            conf = float(box.conf[0])

            # Draw bounding box
            cv2.rectangle(annotated_image, (x1, y1), (x2, y2), green, 2)

            # Draw label just above the box; when the box touches the top
            # edge, move the label inside the box so it stays visible.
            label = f'{model.names[cls]} {conf:.2f}'
            label_y = y1 - 10 if y1 - 10 >= 10 else y1 + 15
            cv2.putText(annotated_image, label, (x1, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    return annotated_image

# Gradio function to process PDF and return images and tables
def process_pdf(pdf):
    """Gradio callback: annotate the images of an uploaded PDF.

    Args:
        pdf: The value supplied by the ``gr.File`` input. Depending on the
            Gradio version/configuration this is either a filesystem path
            string or a file-like object whose ``name`` attribute is the
            path. ``None`` means nothing was uploaded.

    Returns:
        A ``(annotated_images, tables)`` tuple: a list of PIL images with the
        detections drawn on them, and the list returned by
        ``extract_tables_from_pdf``. Both lists are empty when no file was
        uploaded.
    """
    # Submitting the form without a file passes None; return empty outputs
    # instead of failing on the attribute access below.
    if pdf is None:
        return [], []

    # Accept both a plain path string and an object exposing the path as
    # `.name`, so the callback works with either form of gr.File value.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name

    # Extract images and tables from PDF
    images = extract_images_from_pdf(pdf_path)
    tables = extract_tables_from_pdf(pdf_path)

    # Run inference on each image and convert the annotated arrays back to
    # PIL images for the gallery output.
    annotated_images_pil = [Image.fromarray(infer_image(img)) for img in images]

    return annotated_images_pil, tables

# Create Gradio interface: one PDF upload in, a gallery of annotated images
# and a text box for the table output out.
iface = gr.Interface(
    fn=process_pdf,
    # Restrict the upload dialog to PDFs, the only format process_pdf handles.
    inputs=gr.File(label="Upload a PDF", file_types=[".pdf"]),
    outputs=[
        gr.Gallery(label="Annotated Images"),
        gr.Textbox(label="Extracted Tables"),
    ],
    title="PDF Image and Table Extraction with YOLOv8",
    description="Upload a PDF to extract and annotate images and tables using YOLOv8.",
)

# Launch the app only when this file is executed as a script, so importing
# the module (e.g. for testing) does not start a web server.
if __name__ == "__main__":
    iface.launch()