if001 committed
Commit
bbd068b
1 Parent(s): de00515

load from cache

Files changed (1)
  1. sentencepiece_ja.py +10 -4
sentencepiece_ja.py CHANGED
@@ -1,8 +1,8 @@
 import os
 from typing import Union, List, Optional, Tuple
 
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
-
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, AutoTokenizer
+from transformers.utils.hub import cached_file
 class SentencePieceJA(PreTrainedTokenizer):
     def __init__(self,
                  model_path = "./tokenizer.json",
@@ -11,9 +11,15 @@ class SentencePieceJA(PreTrainedTokenizer):
                  eos = "<EOS>",
                  unk = "<UNK>",
                  mask = "<MASK>",
-                 **kwargs):
+                 **kwargs):
         from tokenizers import Tokenizer
-        self._tokenizer = Tokenizer.from_file(model_path)
+        try:
+            self._tokenizer = Tokenizer.from_file(model_path)
+        except Exception as e:
+            print('exception: ', e)
+            print('load from cache...')
+            model_path = cached_file('if001/sentencepiece_ja', 'tokenizer.json')
+            self._tokenizer = Tokenizer.from_file(model_path)
         super().__init__(**kwargs)
         self.add_special_tokens({
             'pad_token': pad,
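
The net effect: constructing the tokenizer no longer requires a local ./tokenizer.json; when Tokenizer.from_file fails, the file is resolved through the Hugging Face Hub cache via cached_file. A minimal sketch of the same load-with-fallback pattern as a standalone helper (load_tokenizer is a hypothetical name, not part of this repo):

    # Sketch of the fallback this commit adds: try a local tokenizer.json
    # first, then resolve the file from the Hugging Face Hub cache.
    # `load_tokenizer` is a hypothetical helper, not part of this repo.
    from tokenizers import Tokenizer
    from transformers.utils.hub import cached_file

    def load_tokenizer(model_path: str = "./tokenizer.json") -> Tokenizer:
        try:
            # Fast path: a tokenizer.json present at the given local path.
            return Tokenizer.from_file(model_path)
        except Exception as e:
            # Fallback, mirroring the commit above: cached_file returns the
            # local cache path of the repo file, downloading it if necessary.
            print('exception: ', e)
            print('load from cache...')
            cached_path = cached_file('if001/sentencepiece_ja', 'tokenizer.json')
            return Tokenizer.from_file(cached_path)

With this change, SentencePieceJA() can be constructed on a machine that has no local tokenizer.json. The diff also imports AutoTokenizer, but it is not used in the hunks shown here.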