Commit c3166be
Parent(s): 1a7fe47

Upload 5 files

- spiece.model +2 -2
- spiece.vocab +0 -0
- spiece_45.model +3 -0
- spiece_45.vocab +0 -0
- tokenizer.py +36 -0
spiece.model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:579ebba0921710bb6bd17cd678d4379b4a81ca84756dab644d7e8529bd01009d
+size 805610
spiece.vocab
CHANGED
The diff for this file is too large to render. See raw diff.
spiece_45.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c489236e2ac4df783bdb4fc930323620027ee0279d2665d263cd74385d899425
+size 802920
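The two .model files above are Git LFS pointers (version, oid, size), not the binaries themselves. If this repository is hosted on the Hugging Face Hub, as the commit page layout suggests, a sketch like the following could fetch the actual files; the repo id is a placeholder, since it is not shown in this view.

# Hypothetical sketch: resolve the LFS pointers above to the real model files.
# "user/repo" is a placeholder; the repository id is not shown in this commit view.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="user/repo",        # placeholder, not taken from the commit
    filename="spiece_45.model",
    revision="c3166be",         # pin to the commit above (a full hash may be required)
)
print(path)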
spiece_45.vocab
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer.py
CHANGED
@@ -1,5 +1,6 @@
 # %pip install sentencepiece
 # %pip install datasets
+# %pip install seqio

 import unicodedata
 import os
@@ -136,3 +137,38 @@ spm.SentencePieceTrainer.train(input="text/final_file.txt", model_prefix='spiece
 num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)


+# Add 100 extra tokens to the model
+from seqio import SentencePieceVocabulary
+import os
+import tensorflow as tf
+from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2
+
+
+def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
+    tf.io.gfile.makedirs(out_dir)
+    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'w').write(vocab.sp_model)
+
+    model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model)
+    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w').write(
+        '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces)
+    )
+
+
+# vocab = t5.data.get_default_vocabulary()
+# out_dir = "../vocabulary/cc_all.32000.100extra"
+#
+# add_100extra(vocab, out_dir)
+#
+# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100)
+# out_dir = "../vocabulary/nedd.32000.100extra"
+# add_100extra(vocab, out_dir)
+#
+# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128)
+# out_dir = "../vocabulary/nedd.32000.128extra"
+# add_100extra(vocab, out_dir)
+#
+
+
+vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
+out_dir = "conv"
+add_100extra(vocab, out_dir)
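As a quick sanity check (not part of the commit), the model written to conv/spiece.model by add_100extra can be inspected with sentencepiece. Whether the 100 sentinel pieces appear directly in the piece list depends on the installed seqio version, since some versions bake extra_ids into vocab.sp_model while others only handle them by offset.

# Minimal sanity-check sketch for the model written by add_100extra above
# (assumes the script has been run and conv/spiece.model exists).
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.load("conv/spiece.model")

print("total pieces:", sp.get_piece_size())
# If the seqio version used baked the extra_ids into sp_model, the last pieces
# are the sentinels, i.e. pieces resembling "<extra_id_99>" ... "<extra_id_0>".
print("last piece:", sp.id_to_piece(sp.get_piece_size() - 1))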