"""Gradio demo that reads CAPTCHA text from an image with an ONNX OCR model.

At import time this module downloads ``model.onnx`` from the Hugging Face Hub
repository ``toandev/ocr-for-captcha``, validates it, opens an ONNX Runtime
session and launches the Gradio interface.
"""

import os
from pathlib import Path

import gradio as gr
import onnx
import onnxruntime as rt
import torch
from huggingface_hub import hf_hub_download, login
from PIL import Image
from torchvision import transforms as T

from utils.tokenizer_base import Tokenizer

# Authenticate only when a token is actually configured: calling ``login`` with
# ``None`` falls back to an interactive prompt, which cannot be answered when
# the app runs unattended.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)

cwd = Path(__file__).parent.resolve()
# If ``hf_hub_download`` returns an absolute path, ``os.path.join`` discards
# ``cwd``; the join only has an effect for a relative cache location.
model_file = os.path.join(cwd, hf_hub_download("toandev/ocr-for-captcha", "model.onnx"))

# Target size passed to ``T.Resize``; presumably (height, width) -- TODO confirm
# against the model's expected input.
img_size = (32, 128)

# NOTE(review): this is a raw string, so the backslashes are kept literally:
# the vocabulary contains a ``\`` before ``"`` and two ``\`` between ``[`` and
# ``]``. Confirm this matches the charset the model was trained with before
# changing it, since it determines the tokenizer's index mapping.
vocab = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
tokenizer = Tokenizer(vocab)


def to_numpy(tensor: torch.Tensor):
    """Return *tensor* as a NumPy array on the CPU.

    ``detach()`` is applied unconditionally: it is required for tensors that
    track gradients and yields the same values for tensors that do not.
    """
    return tensor.detach().cpu().numpy()


def get_transform(img_size):
    """Build the preprocessing pipeline applied to every input image.

    Args:
        img_size: Target size forwarded to ``T.Resize``.

    Returns:
        A ``T.Compose`` that resizes with bicubic interpolation, converts the
        image to a tensor and normalizes it with mean 0.5 and std 0.5.
    """
    return T.Compose(
        [
            T.Resize(img_size, T.InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(0.5, 0.5),
        ]
    )


def load_model(model_file):
    """Validate the ONNX model at *model_file* and open a runtime session.

    Args:
        model_file: Path to the ``.onnx`` file.

    Returns:
        A ``(transform, session)`` tuple: the preprocessing pipeline built for
        the module-level ``img_size`` and the ``rt.InferenceSession``.
    """
    transform = get_transform(img_size)

    # Run the ONNX checker on the file before handing it to ONNX Runtime.
    onnx_model = onnx.load(model_file)
    onnx.checker.check_model(onnx_model)

    session = rt.InferenceSession(model_file)
    return transform, session


transform, s = load_model(model_file=model_file)


def infer(img: Image.Image) -> str:
    """Run the OCR model on *img* and return the decoded text.

    Args:
        img: Image supplied by the Gradio ``Image`` component. Gradio passes
            ``None`` when the user submits without providing an image.

    Returns:
        The first decoded prediction, or an empty string when no image was
        provided.
    """
    if img is None:
        return ""

    # Preprocess and add a leading batch dimension of size 1.
    x = transform(img.convert("RGB")).unsqueeze(0)

    # Feed the session's first declared input and take its first output.
    ort_inputs = {s.get_inputs()[0].name: to_numpy(x)}
    logits = s.run(None, ort_inputs)[0]

    # Softmax over the last axis (presumably the vocabulary axis -- TODO
    # confirm), then let the tokenizer decode the result.
    probs = torch.tensor(logits).softmax(-1)
    preds, _ = tokenizer.decode(probs)
    return preds[0]


demo = gr.Interface(
    infer,
    gr.components.Image(type="pil"),
    gr.components.Textbox(),
    title="OCR for CAPTCHA",
    description="Solve captchas from images including letters and numbers, success rate is about 80-90%.",
    examples=[
        "1.png",
        "2.jpg",
        "3.jpg",
        "4.png",
        "5.png",
    ],
)

demo.launch()