Steven C
committed on
Commit
•
37a77f2
1
Parent(s):
f24f2e7
Format tokenizer_base
Browse files- tokenizer_base.py +41 -21
tokenizer_base.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import re
|
2 |
from abc import ABC, abstractmethod
|
3 |
from itertools import groupby
|
4 |
from typing import List, Optional, Tuple
|
@@ -13,10 +12,9 @@ class CharsetAdapter:
|
|
13 |
|
14 |
def __init__(self, target_charset) -> None:
|
15 |
super().__init__()
|
16 |
-
self.charset = target_charset
|
17 |
self.lowercase_only = target_charset == target_charset.lower()
|
18 |
self.uppercase_only = target_charset == target_charset.upper()
|
19 |
-
# self.unsupported = f'[^{re.escape(target_charset)}]'
|
20 |
|
21 |
def __call__(self, label):
|
22 |
if self.lowercase_only:
|
@@ -28,8 +26,10 @@ class CharsetAdapter:
|
|
28 |
|
29 |
class BaseTokenizer(ABC):
|
30 |
|
31 |
-
def __init__(
|
32 |
-
self
|
|
|
|
|
33 |
self._stoi = {s: i for i, s in enumerate(self._itos)}
|
34 |
|
35 |
def __len__(self):
|
@@ -40,10 +40,12 @@ class BaseTokenizer(ABC):
|
|
40 |
|
41 |
def _ids2tok(self, token_ids: List[int], join: bool = True) -> str:
|
42 |
tokens = [self._itos[i] for i in token_ids]
|
43 |
-
return
|
44 |
|
45 |
@abstractmethod
|
46 |
-
def encode(
|
|
|
|
|
47 |
"""Encode a batch of labels to a representation suitable for the model.
|
48 |
|
49 |
Args:
|
@@ -60,7 +62,9 @@ class BaseTokenizer(ABC):
|
|
60 |
"""Internal method which performs the necessary filtering prior to decoding."""
|
61 |
raise NotImplementedError
|
62 |
|
63 |
-
def decode(
|
|
|
|
|
64 |
"""Decode a batch of token distributions.
|
65 |
|
66 |
Args:
|
@@ -84,19 +88,29 @@ class BaseTokenizer(ABC):
|
|
84 |
|
85 |
|
86 |
class Tokenizer(BaseTokenizer):
|
87 |
-
BOS =
|
88 |
-
EOS =
|
89 |
-
PAD =
|
90 |
|
91 |
def __init__(self, charset: str) -> None:
|
92 |
specials_first = (self.EOS,)
|
93 |
specials_last = (self.BOS, self.PAD)
|
94 |
super().__init__(charset, specials_first, specials_last)
|
95 |
-
self.eos_id, self.bos_id, self.pad_id = [
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)
|
101 |
|
102 |
def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
|
@@ -107,21 +121,27 @@ class Tokenizer(BaseTokenizer):
|
|
107 |
eos_idx = len(ids) # Nothing to truncate.
|
108 |
# Truncate after EOS
|
109 |
ids = ids[:eos_idx]
|
110 |
-
|
|
|
111 |
return probs, ids
|
112 |
|
113 |
|
114 |
class CTCTokenizer(BaseTokenizer):
|
115 |
-
BLANK =
|
116 |
|
117 |
def __init__(self, charset: str) -> None:
|
118 |
# BLANK uses index == 0 by default
|
119 |
super().__init__(charset, specials_first=(self.BLANK,))
|
120 |
self.blank_id = self._stoi[self.BLANK]
|
121 |
|
122 |
-
def encode(
|
|
|
|
|
123 |
# We use a padded representation since we don't want to use CUDNN's CTC implementation
|
124 |
-
batch = [
|
|
|
|
|
|
|
125 |
return pad_sequence(batch, batch_first=True, padding_value=self.blank_id)
|
126 |
|
127 |
def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
|
@@ -129,4 +149,4 @@ class CTCTokenizer(BaseTokenizer):
|
|
129 |
ids = list(zip(*groupby(ids.tolist())))[0] # Remove duplicate tokens
|
130 |
ids = [x for x in ids if x != self.blank_id] # Remove BLANKs
|
131 |
# `probs` is just pass-through since all positions are considered part of the path
|
132 |
-
return probs, ids
|
|
|
|
|
1 |
from abc import ABC, abstractmethod
from itertools import groupby
from typing import List, Optional, Tuple, Union
|
|
|
12 |
|
13 |
def __init__(self, target_charset) -> None:
|
14 |
super().__init__()
|
15 |
+
self.charset = target_charset
|
16 |
self.lowercase_only = target_charset == target_charset.lower()
|
17 |
self.uppercase_only = target_charset == target_charset.upper()
|
|
|
18 |
|
19 |
def __call__(self, label):
|
20 |
if self.lowercase_only:
|
|
|
26 |
|
27 |
class BaseTokenizer(ABC):
|
28 |
|
29 |
+
def __init__(
|
30 |
+
self, charset: str, specials_first: tuple = (), specials_last: tuple = ()
|
31 |
+
) -> None:
|
32 |
+
self._itos = specials_first + tuple(charset + "[UNK]") + specials_last
|
33 |
self._stoi = {s: i for i, s in enumerate(self._itos)}
|
34 |
|
35 |
def __len__(self):
|
|
|
40 |
|
41 |
def _ids2tok(self, token_ids: List[int], join: bool = True) -> str:
|
42 |
tokens = [self._itos[i] for i in token_ids]
|
43 |
+
return "".join(tokens) if join else tokens
|
44 |
|
45 |
@abstractmethod
|
46 |
+
def encode(
|
47 |
+
self, labels: List[str], device: Optional[torch.device] = None
|
48 |
+
) -> Tensor:
|
49 |
"""Encode a batch of labels to a representation suitable for the model.
|
50 |
|
51 |
Args:
|
|
|
62 |
"""Internal method which performs the necessary filtering prior to decoding."""
|
63 |
raise NotImplementedError
|
64 |
|
65 |
+
def decode(
|
66 |
+
self, token_dists: Tensor, raw: bool = False
|
67 |
+
) -> Tuple[List[str], List[Tensor]]:
|
68 |
"""Decode a batch of token distributions.
|
69 |
|
70 |
Args:
|
|
|
88 |
|
89 |
|
90 |
class Tokenizer(BaseTokenizer):
|
91 |
+
    BOS = "[B]"  # beginning-of-sequence marker
    EOS = "[E]"  # end-of-sequence marker (placed first in the vocabulary)
    PAD = "[P]"  # padding marker
|
94 |
|
95 |
def __init__(self, charset: str) -> None:
|
96 |
specials_first = (self.EOS,)
|
97 |
specials_last = (self.BOS, self.PAD)
|
98 |
super().__init__(charset, specials_first, specials_last)
|
99 |
+
self.eos_id, self.bos_id, self.pad_id = [
|
100 |
+
self._stoi[s] for s in specials_first + specials_last
|
101 |
+
]
|
102 |
+
|
103 |
+
def encode(
|
104 |
+
self, labels: List[str], device: Optional[torch.device] = None
|
105 |
+
) -> Tensor:
|
106 |
+
batch = [
|
107 |
+
torch.as_tensor(
|
108 |
+
[self.bos_id] + self._tok2ids(y) + [self.eos_id],
|
109 |
+
dtype=torch.long,
|
110 |
+
device=device,
|
111 |
+
)
|
112 |
+
for y in labels
|
113 |
+
]
|
114 |
return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)
|
115 |
|
116 |
def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
|
|
|
121 |
eos_idx = len(ids) # Nothing to truncate.
|
122 |
# Truncate after EOS
|
123 |
ids = ids[:eos_idx]
|
124 |
+
# but include prob. for EOS (if it exists)
|
125 |
+
probs = probs[: eos_idx + 1]
|
126 |
return probs, ids
|
127 |
|
128 |
|
129 |
class CTCTokenizer(BaseTokenizer):
|
130 |
+
BLANK = "[B]"
|
131 |
|
132 |
def __init__(self, charset: str) -> None:
|
133 |
# BLANK uses index == 0 by default
|
134 |
super().__init__(charset, specials_first=(self.BLANK,))
|
135 |
self.blank_id = self._stoi[self.BLANK]
|
136 |
|
137 |
+
def encode(
|
138 |
+
self, labels: List[str], device: Optional[torch.device] = None
|
139 |
+
) -> Tensor:
|
140 |
# We use a padded representation since we don't want to use CUDNN's CTC implementation
|
141 |
+
batch = [
|
142 |
+
torch.as_tensor(self._tok2ids(y), dtype=torch.long, device=device)
|
143 |
+
for y in labels
|
144 |
+
]
|
145 |
return pad_sequence(batch, batch_first=True, padding_value=self.blank_id)
|
146 |
|
147 |
def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
|
|
|
149 |
ids = list(zip(*groupby(ids.tolist())))[0] # Remove duplicate tokens
|
150 |
ids = [x for x in ids if x != self.blank_id] # Remove BLANKs
|
151 |
# `probs` is just pass-through since all positions are considered part of the path
|
152 |
+
return probs, ids
|