import atexit import base64 import os import shutil import tempfile import time import fitz import gradio as gr import spaces import torch import transformers from PIL import Image, ImageEnhance from transformers import AutoModel, AutoTokenizer transformers.utils.move_cache() transformers.logging.set_verbosity_error() model_name = "ucaslcl/GOT-OCR2_0" device = "cuda" if torch.cuda.is_available() else "cpu" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map=device) model = model.eval().to(device) # 创建一个持久的临时目录 TEMP_DIR = tempfile.mkdtemp() def cleanup(): """清理临时目录""" shutil.rmtree(TEMP_DIR, ignore_errors=True) # 确保在程序退出时清理临时目录 atexit.register(cleanup) def pdf_to_images(pdf_path): images = [] pdf_document = fitz.open(pdf_path) for page_num in range(len(pdf_document)): page = pdf_document.load_page(page_num) zoom = 10 # 增加缩放比例到10 mat = fitz.Matrix(zoom, zoom) pix = page.get_pixmap(matrix=mat, alpha=False) img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) # 增对比度 enhancer = ImageEnhance.Contrast(img) img = enhancer.enhance(1.5) # 增加50%的对比度 images.append(img) pdf_document.close() return images @spaces.GPU() def convert_pdf_to_images(file): if file is None: return "错误:未提供文件", None try: if not file.name.lower().endswith(".pdf"): return "错误:请上传PDF文件", None images = pdf_to_images(file.name) image_paths = [] for i, image in enumerate(images): img_path = os.path.join(TEMP_DIR, f"page_{i+1}.png") image.save(img_path, "PNG") image_paths.append(img_path) return "PDF转换为图片成功", image_paths except Exception as e: return f"错误: {str(e)}", None @spaces.GPU() def ocr_process(image, got_mode, ocr_color="", ocr_box="", progress=gr.Progress()): if image is None: return "错误:未选择图片" try: progress(0, desc="开始处理...") # 模拟OCR处理的不同阶段 progress(0.2, desc="图像预处理...") time.sleep(0.5) progress(0.4, desc="文字识别中...") time.sleep(0.5) progress(0.6, desc="后处理...") time.sleep(0.5) result = process_single_image(image, got_mode, ocr_color, ocr_box) progress(0.8, desc="生成结果...") time.sleep(0.5) progress(1, desc="处理完成") return result except Exception as e: return f"错误: {str(e)}" def process_single_image(image_path, got_mode, ocr_color, ocr_box): result_path = f"{os.path.splitext(image_path)[0]}_result.html" if "plain" in got_mode: if "multi-crop" in got_mode: res = model.chat_crop(tokenizer, image_path, ocr_type="ocr") else: res = model.chat(tokenizer, image_path, ocr_type="ocr", ocr_box=ocr_box, ocr_color=ocr_color) return res elif "format" in got_mode: if "multi-crop" in got_mode: res = model.chat_crop(tokenizer, image_path, ocr_type="format", render=True, save_render_file=result_path) else: res = model.chat(tokenizer, image_path, ocr_type="format", ocr_box=ocr_box, ocr_color=ocr_color, render=True, save_render_file=result_path) if os.path.exists(result_path): with open(result_path, "r", encoding="utf-8") as f: html_content = f.read() encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8") data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}" preview = f'' download_link = f'下载完整结果' return f"{download_link}\n\n{preview}" return "错误: 未知的OCR模式" with gr.Blocks() as demo: gr.Markdown("# OCR 图像识别") with gr.Row(): with gr.Column(scale=1): pdf_input = gr.File(label="上传PDF文件") convert_button = gr.Button("转换PDF为图片") with gr.Column(scale=2): image_gallery = gr.Gallery(label="图片预览", columns=3) with gr.Row(): with gr.Column(scale=1): selected_image = gr.State(value=None) # 使用 gr.State 来存储选中的图片路径 preview_image = gr.Image(label="选中的图片", type="filepath") got_mode = gr.Dropdown( choices=[ "plain texts OCR", "format texts OCR", "plain multi-crop OCR", "format multi-crop OCR", "plain fine-grained OCR", "format fine-grained OCR", ], label="OCR模式", value="plain texts OCR", ) ocr_color = gr.Textbox(label="OCR颜色 (仅用于fine-grained模式)") ocr_box = gr.Textbox(label="OCR边界框 (仅用于fine-grained模式)") ocr_button = gr.Button("开始OCR识别") with gr.Column(scale=2): ocr_output = gr.HTML(label="识别结果") def select_image(evt: gr.SelectData, gallery): selected = gallery[evt.index] return selected image_gallery.select(select_image, image_gallery, selected_image) convert_button.click(convert_pdf_to_images, inputs=[pdf_input], outputs=[gr.Textbox(visible=False), image_gallery]) ocr_button.click(ocr_process, inputs=[selected_image, got_mode, ocr_color, ocr_box], outputs=ocr_output) if __name__ == "__main__": demo.launch()