nobu-g committed on
Commit 299710f
1 Parent(s): ff444a8

Update README.md, tokenization_deberta_v2_jumanpp.py, tokenization_deberta_v2_jumanpp_fast.py

README.md CHANGED
@@ -29,8 +29,8 @@ You can use this model for masked language modeling as follows:
 
 ```python
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese', trust_remote_code=True)
-model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese')
+tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp', trust_remote_code=True)
+model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp')
 
 sentence = '京都大学で自然言語処理を[MASK]する。'
 encoding = tokenizer(sentence, return_tensors='pt')
@@ -41,9 +41,8 @@ You can also fine-tune this model on downstream tasks.
 
 ## Tokenization
 
-~~The input text should be segmented into words by [Juman++](https://github.com/ku-nlp/jumanpp) in advance. [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) was used for pre-training. Each word is tokenized into subwords by [sentencepiece](https://github.com/google/sentencepiece).~~
-
-UPDATE: The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer(Fast)`, so there's no need to segment it in advance. To use `DebertaV2JumanppTokenizer(Fast)`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).
+The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer` or `DebertaV2JumanppTokenizerFast`, so there's no need to segment it in advance.
+To use `DebertaV2JumanppTokenizer` or `DebertaV2JumanppTokenizerFast`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).
 
 ## Training data
 
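The README snippet stops after building `encoding`. As a minimal sketch of finishing the masked-prediction step: the model and tokenizer IDs come from the updated README above, while the decoding logic (locating `[MASK]` and taking the argmax) is an illustrative assumption, not part of the original snippet.

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# IDs as updated in the diff above; trust_remote_code loads the custom
# DebertaV2JumanppTokenizer(Fast) classes from this repository.
tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp', trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp')

sentence = '京都大学で自然言語処理を[MASK]する。'
encoding = tokenizer(sentence, return_tensors='pt')

with torch.no_grad():
    logits = model(**encoding).logits

# Find the [MASK] position(s) and decode the highest-scoring vocabulary entry.
mask_positions = (encoding.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```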
 
tokenization_deberta_v2_jumanpp.py CHANGED
@@ -24,7 +24,7 @@ class JumanppTokenizer:
                 "You need to install rhoknp to use JumanppPreTokenizer. "
                 "See https://github.com/ku-nlp/rhoknp for installation."
             )
-        self.juman = rhoknp.Jumanpp()
+        self.jumanpp = rhoknp.Jumanpp()
 
     def tokenize(self, text: str) -> str:
-        return " ".join([morpheme.surf for morpheme in self.juman.apply_to_sentence(text).morphemes])
+        return " ".join([morpheme.surf for morpheme in self.jumanpp.apply_to_sentence(text).morphemes])
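The attribute rename aside, `tokenize` simply joins Juman++ morpheme surfaces with spaces. A quick sketch of the same call chain through rhoknp directly, assuming Juman++ 2.0.0-rc3 and rhoknp are installed (the printed segmentation is approximate):

```python
import rhoknp

# Apply Juman++ to raw text and join each morpheme's surface form with
# spaces, exactly as JumanppTokenizer.tokenize does above.
jumanpp = rhoknp.Jumanpp()
sentence = jumanpp.apply_to_sentence('京都大学で自然言語処理を研究する。')
print(' '.join(morpheme.surf for morpheme in sentence.morphemes))
# Roughly: 京都 大学 で 自然 言語 処理 を 研究 する 。
```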
tokenization_deberta_v2_jumanpp_fast.py CHANGED
@@ -1,4 +1,5 @@
 import copy
+from typing import List
 
 from tokenizers import NormalizedString, PreTokenizedString, normalizers, pre_tokenizers
 from transformers import DebertaV2TokenizerFast
@@ -54,11 +55,11 @@ class JumanppPreTokenizer:
                 "You need to install rhoknp to use JumanppPreTokenizer. "
                 "See https://github.com/ku-nlp/rhoknp for installation."
             )
-        self.juman = rhoknp.Jumanpp()
+        self.jumanpp = rhoknp.Jumanpp()
 
     def pre_tokenize(self, pretok: PreTokenizedString):
         pretok.split(self.jumanpp_split)
 
-    def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
-        offsets = [morpheme.span for morpheme in self.juman.apply_to_sentence(str(normalized_string)).morphemes]
+    def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+        offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
         return [normalized_string[offset[0]:offset[1]] for offset in offsets]
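Besides the attribute rename, this hunk swaps the built-in generic `list[NormalizedString]` for `typing.List`, which keeps the annotation valid on Python versions before 3.9. A minimal sketch of how a pre-tokenizer like this plugs into the tokenizers pipeline via the library's custom pre-tokenizer hook; the wiring below is illustrative only, and the assumption that `DebertaV2JumanppTokenizerFast` does the equivalent internally is not shown in this diff.

```python
from tokenizers import pre_tokenizers

# Assumes JumanppPreTokenizer from the file above is importable and that
# Juman++ plus rhoknp are installed.
from tokenization_deberta_v2_jumanpp_fast import JumanppPreTokenizer

# Wrap the object in the tokenizers custom hook; it only needs to expose
# pre_tokenize(self, pretok), which JumanppPreTokenizer does.
custom = pre_tokenizers.PreTokenizer.custom(JumanppPreTokenizer())

# pre_tokenize_str returns (piece, (start, end)) pairs, one per morpheme
# span produced by jumanpp_split.
print(custom.pre_tokenize_str('京都大学で自然言語処理を研究する。'))
```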