Siddharth63 committed on
Commit c3166be · 1 Parent(s): 1a7fe47

Upload 5 files

Files changed (5)
  1. spiece.model +2 -2
  2. spiece.vocab +0 -0
  3. spiece_45.model +3 -0
  4. spiece_45.vocab +0 -0
  5. tokenizer.py +36 -0
spiece.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c489236e2ac4df783bdb4fc930323620027ee0279d2665d263cd74385d899425
-size 802920
+oid sha256:579ebba0921710bb6bd17cd678d4379b4a81ca84756dab644d7e8529bd01009d
+size 805610
spiece.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
spiece_45.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c489236e2ac4df783bdb4fc930323620027ee0279d2665d263cd74385d899425
+size 802920
spiece_45.vocab ADDED
The diff for this file is too large to render. See raw diff
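Note that the newly added spiece_45.model carries the same LFS oid and size (c489236e…, 802920 bytes) that spiece.model had before this commit, i.e. it preserves the previous model, while the updated spiece.model pointer is slightly larger (805610 bytes). A minimal sketch for comparing the two models locally, assuming the LFS objects have been pulled and the sentencepiece package is installed (paths are relative to the repo root):

# Hypothetical check: compare piece counts of the old and new SentencePiece models.
# Assumes `git lfs pull` has materialised both files in the working directory.
import sentencepiece as spm

for path in ("spiece.model", "spiece_45.model"):
    sp = spm.SentencePieceProcessor(model_file=path)
    print(path, "pieces:", sp.get_piece_size())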
 
tokenizer.py CHANGED
@@ -1,5 +1,6 @@
 # %pip install sentencepiece
 # %pip install datasets
+# %pip install seqio
 
 import unicodedata
 import os
@@ -136,3 +137,38 @@ spm.SentencePieceTrainer.train(input="text/final_file.txt", model_prefix='spiece
                                 num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)
 
 
+# Add 100 extra tokens to the model
+from seqio import SentencePieceVocabulary
+import os
+import tensorflow as tf
+from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2
+
+
+def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
+    tf.io.gfile.makedirs(out_dir)
+    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'w').write(vocab.sp_model)
+
+    model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model)
+    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w').write(
+        '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces)
+    )
+
+
+# vocab = t5.data.get_default_vocabulary()
+# out_dir = "../vocabulary/cc_all.32000.100extra"
+#
+# add_100extra(vocab, out_dir)
+#
+# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100)
+# out_dir = "../vocabulary/nedd.32000.100extra"
+# add_100extra(vocab, out_dir)
+#
+# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128)
+# out_dir = "../vocabulary/nedd.32000.128extra"
+# add_100extra(vocab, out_dir)
+#
+
+
+vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
+out_dir = "conv"
+add_100extra(vocab, out_dir)
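The final lines build a vocabulary from spiece_45.model with extra_ids=100 and write the result to conv/. The script relies on vocab.sp_model containing the appended sentinel pieces; if so, the written model should report a piece count 100 larger than spiece_45.model, which is consistent with the new spiece.model pointer being slightly larger than the original. A minimal sketch of inspecting the output, assuming the script has been run and conv/spiece.model exists (sentencepiece only, no seqio needed):

# Hypothetical inspection of the model written by add_100extra above.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="conv/spiece.model")
n = sp.get_piece_size()
print("total pieces:", n)
# If the extra tokens were serialised, the appended sentinel pieces sit at the end of the id range.
print("last piece:", sp.id_to_piece(n - 1))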