import gradio as gr
from ultralytics import YOLO
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import cv2
import io

# Load the trained YOLOv8 model
model_path = 'best.pt'  # Replace with the path to your trained .pt file
model = YOLO(model_path)


# Function to extract embedded images from a PDF
def extract_images_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        for img_num, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            images.append(image)
    return images


# Placeholder function to extract tables (modify as needed)
def extract_tables_from_pdf(pdf_path):
    # Dummy implementation; replace with actual table extraction logic
    return ["Table extraction not implemented"]


# Function to perform inference on a single image
def infer_image(image):
    # Convert the PIL image to an RGB NumPy array
    image_rgb = np.array(image.convert('RGB'))

    # Perform inference
    results = model(image_rgb)

    # Annotate a copy of the image with the detections
    annotated_image = image_rgb.copy()
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = box.xyxy[0]
            cls = int(box.cls[0])
            conf = float(box.conf[0])
            # Draw bounding box
            cv2.rectangle(annotated_image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
            # Draw class label and confidence score
            label = f'{model.names[cls]} {conf:.2f}'
            cv2.putText(annotated_image, label, (int(x1), int(y1) - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return annotated_image


# Gradio function to process a PDF and return annotated images and tables
def process_pdf(pdf):
    # Gradio may pass either a file path or a tempfile-like object, depending on version
    pdf_path = pdf if isinstance(pdf, str) else pdf.name

    # Extract images and tables from the PDF
    images = extract_images_from_pdf(pdf_path)
    tables = extract_tables_from_pdf(pdf_path)

    # Perform inference on the extracted images
    annotated_images = [infer_image(img) for img in images]

    # Convert annotated arrays back to PIL Images for the Gradio gallery
    annotated_images_pil = [Image.fromarray(img) for img in annotated_images]

    # Join the table results into a single string for the Textbox output
    return annotated_images_pil, "\n\n".join(tables)


# Create Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload a PDF"),
    outputs=[
        gr.Gallery(label="Annotated Images"),
        gr.Textbox(label="Extracted Tables")
    ],
    title="PDF Image and Table Extraction with YOLOv8",
    description="Upload a PDF to extract and annotate images and tables using YOLOv8."
)

# Launch the app
iface.launch()
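

# --- Optional: a minimal sketch of real table extraction, assuming pdfplumber ---
# The function above (extract_tables_from_pdf) is only a placeholder. One possible
# replacement is sketched below using pdfplumber's page.extract_tables(), which
# returns each table as a list of rows of cell strings. This is an assumption, not
# part of the original script: pdfplumber must be installed separately
# (pip install pdfplumber), and to take effect this definition must replace the
# placeholder above (i.e. be defined before process_pdf is called), since code
# placed after iface.launch() only runs once the server stops.
def extract_tables_from_pdf_pdfplumber(pdf_path):
    import pdfplumber  # local import so the rest of the app works without it

    tables_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table in page.extract_tables():
                # Render each table as tab-separated rows; empty cells become ""
                rows = ["\t".join(cell if cell is not None else "" for cell in row)
                        for row in table]
                tables_text.append(f"Page {page_num}:\n" + "\n".join(rows))
    return tables_text or ["No tables found"]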