toandev committed on
Commit 50d8f01 · 1 Parent(s): d5995c8
Files changed (8)
  1. 1.png +0 -0
  2. 2.jpg +0 -0
  3. 3.jpg +0 -0
  4. 4.png +0 -0
  5. 5.png +0 -0
  6. app.py +83 -0
  7. requirements.txt +5 -0
  8. utils/tokenizer_base.py +132 -0
1.png ADDED
2.jpg ADDED
3.jpg ADDED
4.png ADDED
5.png ADDED
app.py ADDED
@@ -0,0 +1,83 @@
+ import torch
+ import onnx
+ import onnxruntime as rt
+ from torchvision import transforms as T
+ from pathlib import Path
+ from PIL import Image
+ from huggingface_hub import login, hf_hub_download
+
+ import os
+ import gradio as gr
+
+ from utils.tokenizer_base import Tokenizer
+
+
+ login(os.getenv("HF_TOKEN"))
+
+ cwd = Path(__file__).parent.resolve()
+ model_file = os.path.join(cwd, hf_hub_download("toandev/ocr-for-captcha", "model.onnx"))
+
+ img_size = (32, 128)
+
+ vocab = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+ tokenizer = Tokenizer(vocab)
+
+
+ def to_numpy(tensor):
+     return (
+         tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+     )
+
+
+ def get_transform(img_size):
+     transforms = []
+     transforms.extend(
+         [
+             T.Resize(img_size, T.InterpolationMode.BICUBIC),
+             T.ToTensor(),
+             T.Normalize(0.5, 0.5),
+         ]
+     )
+     return T.Compose(transforms)
+
+
+ def load_model(model_file):
+     transform = get_transform(img_size)
+
+     onnx_model = onnx.load(model_file)
+     onnx.checker.check_model(onnx_model)
+
+     s = rt.InferenceSession(model_file)
+     return transform, s
+
+
+ transform, s = load_model(model_file=model_file)
+
+
+ def infer(img: Image.Image):
+     x = transform(img.convert("RGB")).unsqueeze(0)
+
+     ort_inputs = {s.get_inputs()[0].name: to_numpy(x)}
+     logits = s.run(None, ort_inputs)[0]
+     probs = torch.tensor(logits).softmax(-1)
+     preds, probs = tokenizer.decode(probs)
+
+     return preds[0]
+
+
+ demo = gr.Interface(
+     infer,
+     gr.components.Image(type="pil"),
+     gr.components.Textbox(),
+     title="OCR for CAPTCHA",
+     description="Solves CAPTCHAs containing letters and numbers; the success rate is about 80-90%.",
+     examples=[
+         "1.png",
+         "2.jpg",
+         "3.jpg",
+         "4.png",
+         "5.png",
+     ],
+ )
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ torchvision
+ onnx
+ onnxruntime
+ Pillow
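Note that gradio and huggingface_hub are absent from the pins; on Hugging Face Spaces they ship with the Gradio SDK runtime. For a local run they have to be installed alongside the pinned packages, e.g.:

    pip install -r requirements.txt gradio huggingface_hub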
utils/tokenizer_base.py ADDED
@@ -0,0 +1,132 @@
+ import re
+ from abc import ABC, abstractmethod
+ from itertools import groupby
+ from typing import List, Optional, Tuple
+
+ import torch
+ from torch import Tensor
+ from torch.nn.utils.rnn import pad_sequence
+
+
+ class CharsetAdapter:
+     """Transforms labels according to the target charset."""
+
+     def __init__(self, target_charset) -> None:
+         super().__init__()
+         self.charset = target_charset
+         self.lowercase_only = target_charset == target_charset.lower()
+         self.uppercase_only = target_charset == target_charset.upper()
+         # self.unsupported = f'[^{re.escape(target_charset)}]'
+
+     def __call__(self, label):
+         if self.lowercase_only:
+             label = label.lower()
+         elif self.uppercase_only:
+             label = label.upper()
+         return label
+
+
+ class BaseTokenizer(ABC):
+
+     def __init__(self, charset: str, specials_first: tuple = (), specials_last: tuple = ()) -> None:
+         self._itos = specials_first + tuple(charset + '[UNK]') + specials_last
+         self._stoi = {s: i for i, s in enumerate(self._itos)}
+
+     def __len__(self):
+         return len(self._itos)
+
+     def _tok2ids(self, tokens: str) -> List[int]:
+         return [self._stoi[s] for s in tokens]
+
+     def _ids2tok(self, token_ids: List[int], join: bool = True) -> str:
+         tokens = [self._itos[i] for i in token_ids]
+         return ''.join(tokens) if join else tokens
+
+     @abstractmethod
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         """Encode a batch of labels to a representation suitable for the model.
+
+         Args:
+             labels: List of labels. Each can be of arbitrary length.
+             device: Create tensor on this device.
+
+         Returns:
+             Batched tensor representation padded to the max label length. Shape: N, L
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         """Internal method which performs the necessary filtering prior to decoding."""
+         raise NotImplementedError
+
+     def decode(self, token_dists: Tensor, raw: bool = False) -> Tuple[List[str], List[Tensor]]:
+         """Decode a batch of token distributions.
+
+         Args:
+             token_dists: softmax probabilities over the token distribution. Shape: N, L, C
+             raw: return unprocessed labels (will return list of list of strings)
+
+         Returns:
+             list of string labels (arbitrary length) and
+             their corresponding sequence probabilities as a list of Tensors
+         """
+         batch_tokens = []
+         batch_probs = []
+         for dist in token_dists:
+             probs, ids = dist.max(-1)  # greedy selection
+             if not raw:
+                 probs, ids = self._filter(probs, ids)
+             tokens = self._ids2tok(ids, not raw)
+             batch_tokens.append(tokens)
+             batch_probs.append(probs)
+         return batch_tokens, batch_probs
+
+
+ class Tokenizer(BaseTokenizer):
+     BOS = '[B]'
+     EOS = '[E]'
+     PAD = '[P]'
+
+     def __init__(self, charset: str) -> None:
+         specials_first = (self.EOS,)
+         specials_last = (self.BOS, self.PAD)
+         super().__init__(charset, specials_first, specials_last)
+         self.eos_id, self.bos_id, self.pad_id = [self._stoi[s] for s in specials_first + specials_last]
+
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         batch = [torch.as_tensor([self.bos_id] + self._tok2ids(y) + [self.eos_id], dtype=torch.long, device=device)
+                  for y in labels]
+         return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)
+
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         ids = ids.tolist()
+         try:
+             eos_idx = ids.index(self.eos_id)
+         except ValueError:
+             eos_idx = len(ids)  # Nothing to truncate.
+         # Truncate after EOS
+         ids = ids[:eos_idx]
+         probs = probs[:eos_idx + 1]  # but include prob. for EOS (if it exists)
+         return probs, ids
+
+
+ class CTCTokenizer(BaseTokenizer):
+     BLANK = '[B]'
+
+     def __init__(self, charset: str) -> None:
+         # BLANK uses index == 0 by default
+         super().__init__(charset, specials_first=(self.BLANK,))
+         self.blank_id = self._stoi[self.BLANK]
+
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         # We use a padded representation since we don't want to use CUDNN's CTC implementation
+         batch = [torch.as_tensor(self._tok2ids(y), dtype=torch.long, device=device) for y in labels]
+         return pad_sequence(batch, batch_first=True, padding_value=self.blank_id)
+
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         # Best path decoding:
+         ids = list(zip(*groupby(ids.tolist())))[0]  # Remove duplicate tokens
+         ids = [x for x in ids if x != self.blank_id]  # Remove BLANKs
+         # `probs` is just pass-through since all positions are considered part of the path
+         return probs, ids
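To make the decode path above concrete, a small round-trip sketch (the toy charset "abc123" is hypothetical and unrelated to the model's vocabulary):

    import torch
    from utils.tokenizer_base import Tokenizer

    tok = Tokenizer("abc123")        # hypothetical toy charset
    ids = tok.encode(["abc", "31"])  # rows look like [BOS, chars..., EOS, PAD...]

    # Mimic model output by dropping the BOS column: decode() expects
    # distributions that start at the first character and end with EOS.
    dists = torch.nn.functional.one_hot(ids[:, 1:], num_classes=len(tok)).float()
    labels, probs = tok.decode(dists)
    print(labels)  # ['abc', '31']

The same greedy max-then-truncate-at-EOS logic is what infer() in app.py relies on when it passes the softmaxed ONNX logits to tokenizer.decode.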