toandev committed on
Commit 50d8f01 · 1 Parent(s): d5995c8
Files changed (8)
  1. 1.png +0 -0
  2. 2.jpg +0 -0
  3. 3.jpg +0 -0
  4. 4.png +0 -0
  5. 5.png +0 -0
  6. app.py +83 -0
  7. requirements.txt +5 -0
  8. utils/tokenizer_base.py +132 -0
1.png ADDED
2.jpg ADDED
3.jpg ADDED
4.png ADDED
5.png ADDED
app.py ADDED
@@ -0,0 +1,83 @@
+ import torch
+ import onnx
+ import onnxruntime as rt
+ from torchvision import transforms as T
+ from pathlib import Path
+ from PIL import Image
+ from huggingface_hub import login, hf_hub_download
+
+ import os
+ import gradio as gr
+
+ from utils.tokenizer_base import Tokenizer
+
+
+ login(os.getenv("HF_TOKEN"))
+
+ cwd = Path(__file__).parent.resolve()
+ model_file = os.path.join(cwd, hf_hub_download("toandev/ocr-for-captcha", "model.onnx"))
+
+ img_size = (32, 128)
+
+ vocab = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+ tokenizer = Tokenizer(vocab)
+
+
+ def to_numpy(tensor):
+     return (
+         tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+     )
+
+
+ def get_transform(img_size):
+     transforms = []
+     transforms.extend(
+         [
+             T.Resize(img_size, T.InterpolationMode.BICUBIC),
+             T.ToTensor(),
+             T.Normalize(0.5, 0.5),
+         ]
+     )
+     return T.Compose(transforms)
+
+
+ def load_model(model_file):
+     transform = get_transform(img_size)
+
+     onnx_model = onnx.load(model_file)
+     onnx.checker.check_model(onnx_model)
+
+     s = rt.InferenceSession(model_file)
+     return transform, s
+
+
+ transform, s = load_model(model_file=model_file)
+
+
+ def infer(img: Image.Image):
+     x = transform(img.convert("RGB")).unsqueeze(0)
+
+     ort_inputs = {s.get_inputs()[0].name: to_numpy(x)}
+     logits = s.run(None, ort_inputs)[0]
+     probs = torch.tensor(logits).softmax(-1)
+     preds, probs = tokenizer.decode(probs)
+
+     return preds[0]
+
+
+ demo = gr.Interface(
+     infer,
+     gr.components.Image(type="pil"),
+     gr.components.Textbox(),
+     title="OCR for CAPTCHA",
+     description="Solves CAPTCHAs containing letters and numbers; the success rate is about 80-90%.",
+     examples=[
+         "1.png",
+         "2.jpg",
+         "3.jpg",
+         "4.png",
+         "5.png",
+     ],
+ )
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ torchvision
+ onnx
+ onnxruntime
+ Pillow
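Note that gradio and huggingface_hub are absent from the pins; on Hugging Face Spaces they ship with the Gradio SDK runtime. For a local run they have to be installed alongside the pinned packages, e.g.:

    pip install -r requirements.txt gradio huggingface_hub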
utils/tokenizer_base.py ADDED
@@ -0,0 +1,132 @@
+ import re
+ from abc import ABC, abstractmethod
+ from itertools import groupby
+ from typing import List, Optional, Tuple
+
+ import torch
+ from torch import Tensor
+ from torch.nn.utils.rnn import pad_sequence
+
+
+ class CharsetAdapter:
+     """Transforms labels according to the target charset."""
+
+     def __init__(self, target_charset) -> None:
+         super().__init__()
+         self.charset = target_charset
+         self.lowercase_only = target_charset == target_charset.lower()
+         self.uppercase_only = target_charset == target_charset.upper()
+         # self.unsupported = f'[^{re.escape(target_charset)}]'
+
+     def __call__(self, label):
+         if self.lowercase_only:
+             label = label.lower()
+         elif self.uppercase_only:
+             label = label.upper()
+         return label
+
+
+ class BaseTokenizer(ABC):
+
+     def __init__(self, charset: str, specials_first: tuple = (), specials_last: tuple = ()) -> None:
+         self._itos = specials_first + tuple(charset + '[UNK]') + specials_last
+         self._stoi = {s: i for i, s in enumerate(self._itos)}
+
+     def __len__(self):
+         return len(self._itos)
+
+     def _tok2ids(self, tokens: str) -> List[int]:
+         return [self._stoi[s] for s in tokens]
+
+     def _ids2tok(self, token_ids: List[int], join: bool = True) -> str:
+         tokens = [self._itos[i] for i in token_ids]
+         return ''.join(tokens) if join else tokens
+
+     @abstractmethod
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         """Encode a batch of labels to a representation suitable for the model.
+
+         Args:
+             labels: List of labels. Each can be of arbitrary length.
+             device: Create tensor on this device.
+
+         Returns:
+             Batched tensor representation padded to the max label length. Shape: N, L
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         """Internal method which performs the necessary filtering prior to decoding."""
+         raise NotImplementedError
+
+     def decode(self, token_dists: Tensor, raw: bool = False) -> Tuple[List[str], List[Tensor]]:
+         """Decode a batch of token distributions.
+
+         Args:
+             token_dists: softmax probabilities over the token distribution. Shape: N, L, C
+             raw: return unprocessed labels (will return list of list of strings)
+
+         Returns:
+             list of string labels (arbitrary length) and
+             their corresponding sequence probabilities as a list of Tensors
+         """
+         batch_tokens = []
+         batch_probs = []
+         for dist in token_dists:
+             probs, ids = dist.max(-1)  # greedy selection
+             if not raw:
+                 probs, ids = self._filter(probs, ids)
+             tokens = self._ids2tok(ids, not raw)
+             batch_tokens.append(tokens)
+             batch_probs.append(probs)
+         return batch_tokens, batch_probs
+
+
+ class Tokenizer(BaseTokenizer):
+     BOS = '[B]'
+     EOS = '[E]'
+     PAD = '[P]'
+
+     def __init__(self, charset: str) -> None:
+         specials_first = (self.EOS,)
+         specials_last = (self.BOS, self.PAD)
+         super().__init__(charset, specials_first, specials_last)
+         self.eos_id, self.bos_id, self.pad_id = [self._stoi[s] for s in specials_first + specials_last]
+
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         batch = [torch.as_tensor([self.bos_id] + self._tok2ids(y) + [self.eos_id], dtype=torch.long, device=device)
+                  for y in labels]
+         return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)
+
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         ids = ids.tolist()
+         try:
+             eos_idx = ids.index(self.eos_id)
+         except ValueError:
+             eos_idx = len(ids)  # Nothing to truncate.
+         # Truncate after EOS
+         ids = ids[:eos_idx]
+         probs = probs[:eos_idx + 1]  # but include prob. for EOS (if it exists)
+         return probs, ids
+
+
+ class CTCTokenizer(BaseTokenizer):
+     BLANK = '[B]'
+
+     def __init__(self, charset: str) -> None:
+         # BLANK uses index == 0 by default
+         super().__init__(charset, specials_first=(self.BLANK,))
+         self.blank_id = self._stoi[self.BLANK]
+
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         # We use a padded representation since we don't want to use CUDNN's CTC implementation
+         batch = [torch.as_tensor(self._tok2ids(y), dtype=torch.long, device=device) for y in labels]
+         return pad_sequence(batch, batch_first=True, padding_value=self.blank_id)
+
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         # Best path decoding:
+         ids = list(zip(*groupby(ids.tolist())))[0]  # Remove duplicate tokens
+         ids = [x for x in ids if x != self.blank_id]  # Remove BLANKs
+         # `probs` is just pass-through since all positions are considered part of the path
+         return probs, ids
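To make the decode path above concrete, a small round-trip sketch (the toy charset "abc123" is hypothetical and unrelated to the model's vocabulary):

    import torch
    from utils.tokenizer_base import Tokenizer

    tok = Tokenizer("abc123")        # hypothetical toy charset
    ids = tok.encode(["abc", "31"])  # rows look like [BOS, chars..., EOS, PAD...]

    # Mimic model output by dropping the BOS column: decode() expects
    # distributions that start at the first character and end with EOS.
    dists = torch.nn.functional.one_hot(ids[:, 1:], num_classes=len(tok)).float()
    labels, probs = tok.decode(dists)
    print(labels)  # ['abc', '31']

The same greedy max-then-truncate-at-EOS logic is what infer() in app.py relies on when it passes the softmaxed ONNX logits to tokenizer.decode.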