Steven C
committed on
Commit
•
c825110
0
Parent(s):
init commit
Browse files- .gitattributes +35 -0
- .gitignore +1 -0
- app.py +69 -0
- requirements.txt +6 -0
- secret_models +1 -0
- tokenizer_base.py +132 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
app.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import onnx
|
3 |
+
import onnxruntime as rt
|
4 |
+
from torchvision import transforms as T
|
5 |
+
from tokenizer_base import Tokenizer
|
6 |
+
import pathlib
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
from PIL import Image
|
10 |
+
|
11 |
+
from huggingface_hub import Repository
|
12 |
+
|
13 |
+
# Directory containing this script. Both the model checkout below and the
# model path built later are resolved against it, so the app behaves the same
# no matter which working directory the process was started from.
cwd = pathlib.Path(__file__).parent.resolve()

# Clone (or reuse) the model repository next to this script and update it.
# NOTE(review): token=True presumably makes huggingface_hub use the locally
# stored access token -- confirm against the huggingface_hub documentation.
repo = Repository(
    local_dir=os.path.join(cwd, "secret_models"),
    repo_type="model",
    clone_from="docparser/captcha",
    token=True,
)
repo.git_pull()

# Target size handed to T.Resize in get_transform().
img_size = (32, 128)

# NOTE(review): this is a *raw* string, so the backslashes in `\"` and `\\`
# are kept as literal characters of the charset. The token indices derived
# from it presumably have to match the exported model, so the literal is
# reproduced exactly -- confirm before "cleaning it up".
charset = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
tokenizer_base = Tokenizer(charset)
|
25 |
+
|
26 |
+
|
27 |
+
def get_transform(img_size):
    """Build the image preprocessing pipeline.

    Args:
        img_size: Target size forwarded to ``T.Resize``.

    Returns:
        A ``T.Compose`` that resizes with bicubic interpolation, converts the
        image to a tensor and normalizes it with mean 0.5 and std 0.5.
    """
    return T.Compose([
        T.Resize(img_size, T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(0.5, 0.5),
    ])
|
35 |
+
|
36 |
+
|
37 |
+
def to_numpy(tensor):
    """Return ``tensor`` as a NumPy array on the CPU.

    Tensors that track gradients are detached first, since ``.numpy()``
    cannot be called on a tensor that requires grad.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
|
39 |
+
|
40 |
+
|
41 |
+
def initialize_model(model_file):
    """Prepare everything needed to run inference with an ONNX model.

    Args:
        model_file: Path to the ``.onnx`` file.

    Returns:
        A ``(transform, ort_session)`` pair: the preprocessing pipeline built
        for the module-level ``img_size`` and an ONNX Runtime inference
        session for ``model_file``.
    """
    preprocess = get_transform(img_size)
    # Run the ONNX checker on the loaded model before a session is created.
    onnx.checker.check_model(onnx.load(model_file))
    session = rt.InferenceSession(model_file)
    return preprocess, session
|
47 |
+
|
48 |
+
|
49 |
+
def get_text(image_path):
    """Recognize the text in the image at ``image_path``.

    Uses the module-level ``transform``, ``ort_session`` and
    ``tokenizer_base``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The decoded string for the (single) image in the batch.
    """
    # Open the image through a context manager so the underlying file is
    # released as soon as preprocessing is done, even if it raises.
    with Image.open(image_path) as img_org:
        # Preprocess. Model expects a batch of images with shape: (B, C, H, W)
        x = transform(img_org.convert('RGB')).unsqueeze(0)

    # Compute the ONNX Runtime prediction; the model's first input receives the batch.
    input_name = ort_session.get_inputs()[0].name
    logits = ort_session.run(None, {input_name: to_numpy(x)})[0]

    # The tokenizer decodes softmax probabilities, not raw logits.
    token_dists = torch.tensor(logits).softmax(-1)
    preds, _ = tokenizer_base.decode(token_dists)
    return preds[0]
|
61 |
+
|
62 |
+
|
63 |
+
# Load the recognizer once at import time; get_text() reuses these globals.
model_file = str(cwd / "secret_models" / "captcha.onnx")
transform, ort_session = initialize_model(model_file=model_file)
|
65 |
+
|
66 |
+
if __name__ == "__main__":
    # Command-line entry point: print the text recognized in the given image.
    # Exit with a usage message rather than an IndexError traceback when the
    # image path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} IMAGE_PATH")
    image_path = sys.argv[1]
    print(get_text(image_path))
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch==1.11.0
|
2 |
+
torchvision==0.12.0
|
3 |
+
onnx==1.14.0
|
4 |
+
onnxruntime==1.15.1
|
5 |
+
Pillow==10.0.0
|
6 |
+
huggingface-hub==0.21.4
|
secret_models
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Subproject commit 99c2c60ac74ca2b4a90613ccec0685e416dff0a8
|
tokenizer_base.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from abc import ABC, abstractmethod
|
3 |
+
from itertools import groupby
|
4 |
+
from typing import List, Optional, Tuple
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch import Tensor
|
8 |
+
from torch.nn.utils.rnn import pad_sequence
|
9 |
+
|
10 |
+
|
11 |
+
class CharsetAdapter:
    """Adjusts the case of labels so they fit the target charset."""

    def __init__(self, target_charset) -> None:
        super().__init__()
        self.charset = target_charset
        # A charset counts as single-case when converting it to that case
        # leaves it unchanged; a charset without cased characters is both.
        self.lowercase_only = target_charset.lower() == target_charset
        self.uppercase_only = target_charset.upper() == target_charset

    def __call__(self, label):
        """Return ``label`` lower-/upper-cased for single-case charsets, else unchanged."""
        # Lowercase takes precedence when both flags are set.
        if self.lowercase_only:
            return label.lower()
        if self.uppercase_only:
            return label.upper()
        return label
|
27 |
+
|
28 |
+
|
29 |
+
class BaseTokenizer(ABC):
    """Abstract mapping between symbols and integer token ids.

    Subclasses provide ``encode`` and ``_filter``; greedy ``decode`` is shared.
    """

    def __init__(self, charset: str, specials_first: tuple = (), specials_last: tuple = ()) -> None:
        # NOTE(review): ``tuple(charset + '[UNK]')`` splits '[UNK]' into five
        # single-character entries instead of adding one token. The resulting
        # index layout is presumably what the trained model expects, so it is
        # kept as-is -- confirm before changing.
        vocab = specials_first + tuple(charset + '[UNK]') + specials_last
        self._itos = vocab
        # For a symbol occurring more than once, the last index wins.
        self._stoi = dict(zip(vocab, range(len(vocab))))

    def __len__(self):
        """Number of entries in the id-to-symbol table."""
        return len(self._itos)

    def _tok2ids(self, tokens: str) -> List[int]:
        """Map each symbol of ``tokens`` to its id (KeyError for unknown symbols)."""
        lookup = self._stoi
        return [lookup[symbol] for symbol in tokens]

    def _ids2tok(self, token_ids: List[int], join: bool = True) -> str:
        """Map ids back to symbols; returns a list of symbols when ``join`` is False."""
        symbols = [self._itos[idx] for idx in token_ids]
        if join:
            return ''.join(symbols)
        return symbols

    @abstractmethod
    def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
        """Turn a batch of labels into the tensor representation the model consumes.

        Args:
            labels: List of labels. Each can be of arbitrary length.
            device: Create tensor on this device.

        Returns:
            Batched tensor representation padded to the max label length. Shape: N, L
        """
        raise NotImplementedError

    @abstractmethod
    def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
        """Subclass hook that filters ``probs``/``ids`` before they are decoded."""
        raise NotImplementedError

    def decode(self, token_dists: Tensor, raw: bool = False) -> Tuple[List[str], List[Tensor]]:
        """Greedily decode a batch of token distributions.

        Args:
            token_dists: softmax probabilities over the token distribution. Shape: N, L, C
            raw: return unprocessed labels (will return list of list of strings)

        Returns:
            The decoded labels (arbitrary length) together with the
            per-position probabilities of each sequence as a list of Tensors.
        """
        batch_tokens, batch_probs = [], []
        for dist in token_dists:
            step_probs, step_ids = dist.max(-1)  # greedy selection
            if raw:
                # Unfiltered: keep every position and leave the symbols unjoined.
                tokens = self._ids2tok(step_ids, False)
            else:
                step_probs, step_ids = self._filter(step_probs, step_ids)
                tokens = self._ids2tok(step_ids, True)
            batch_tokens.append(tokens)
            batch_probs.append(step_probs)
        return batch_tokens, batch_probs
|
84 |
+
|
85 |
+
|
86 |
+
class Tokenizer(BaseTokenizer):
    """Tokenizer with explicit begin-, end-of-sequence and padding tokens."""

    BOS = '[B]'
    EOS = '[E]'
    PAD = '[P]'

    def __init__(self, charset: str) -> None:
        # EOS is placed before the charset; BOS and PAD follow it.
        leading = (self.EOS,)
        trailing = (self.BOS, self.PAD)
        super().__init__(charset, leading, trailing)
        self.eos_id = self._stoi[self.EOS]
        self.bos_id = self._stoi[self.BOS]
        self.pad_id = self._stoi[self.PAD]

    def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
        """Wrap each label in BOS/EOS ids and pad the batch with the PAD id."""
        batch = []
        for label in labels:
            ids = [self.bos_id, *self._tok2ids(label), self.eos_id]
            batch.append(torch.as_tensor(ids, dtype=torch.long, device=device))
        return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)

    def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
        """Cut the sequence at the first EOS; without an EOS nothing is cut."""
        ids = ids.tolist()
        eos_idx = ids.index(self.eos_id) if self.eos_id in ids else len(ids)
        # The ids stop before EOS, while the probabilities keep the EOS entry
        # (if it exists).
        return probs[:eos_idx + 1], ids[:eos_idx]
|
112 |
+
|
113 |
+
|
114 |
+
class CTCTokenizer(BaseTokenizer):
    """Tokenizer for CTC-style outputs, using a BLANK token."""

    BLANK = '[B]'

    def __init__(self, charset: str) -> None:
        # BLANK uses index == 0 by default
        super().__init__(charset, specials_first=(self.BLANK,))
        self.blank_id = self._stoi[self.BLANK]

    def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
        """Encode labels to id tensors, padded to equal length with the BLANK id."""
        # We use a padded representation since we don't want to use CUDNN's CTC implementation
        batch = [torch.as_tensor(self._tok2ids(y), dtype=torch.long, device=device) for y in labels]
        return pad_sequence(batch, batch_first=True, padding_value=self.blank_id)

    def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
        """Best-path decoding: collapse runs of repeated ids, then drop BLANKs.

        Iterating the ``groupby`` keys directly also handles an empty id
        sequence, which previously raised ``IndexError``.
        """
        ids = [key for key, _ in groupby(ids.tolist()) if key != self.blank_id]
        # `probs` is just pass-through since all positions are considered part of the path
        return probs, ids
|