Commit c3166be
Parent(s): 1a7fe47

Upload 5 files

- spiece.model +2 -2
- spiece.vocab +0 -0
- spiece_45.model +3 -0
- spiece_45.vocab +0 -0
- tokenizer.py +36 -0
spiece.model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:579ebba0921710bb6bd17cd678d4379b4a81ca84756dab644d7e8529bd01009d
+size 805610
spiece.vocab
CHANGED
The diff for this file is too large to render. See raw diff.
spiece_45.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c489236e2ac4df783bdb4fc930323620027ee0279d2665d263cd74385d899425
+size 802920
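The two .model files above are Git LFS pointers (version, oid, size), not the binaries themselves. If this repository is hosted on the Hugging Face Hub, as the commit page layout suggests, a sketch like the following could fetch the actual files; the repo id is a placeholder, since it is not shown in this view.

# Hypothetical sketch: resolve the LFS pointers above to the real model files.
# "user/repo" is a placeholder; the repository id is not shown in this commit view.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="user/repo",        # placeholder, not taken from the commit
    filename="spiece_45.model",
    revision="c3166be",         # pin to the commit above (a full hash may be required)
)
print(path)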
spiece_45.vocab
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer.py
CHANGED
@@ -1,5 +1,6 @@
 # %pip install sentencepiece
 # %pip install datasets
+# %pip install seqio

 import unicodedata
 import os
@@ -136,3 +137,38 @@ spm.SentencePieceTrainer.train(input="text/final_file.txt", model_prefix='spiece
 num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)


+# Add 100 extra tokens to the model
+from seqio import SentencePieceVocabulary
+import os
+import tensorflow as tf
+from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2
+
+
+def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
+    tf.io.gfile.makedirs(out_dir)
+    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'w').write(vocab.sp_model)
+
+    model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model)
+    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w').write(
+        '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces)
+    )
+
+
+# vocab = t5.data.get_default_vocabulary()
+# out_dir = "../vocabulary/cc_all.32000.100extra"
+#
+# add_100extra(vocab, out_dir)
+#
+# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100)
+# out_dir = "../vocabulary/nedd.32000.100extra"
+# add_100extra(vocab, out_dir)
+#
+# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128)
+# out_dir = "../vocabulary/nedd.32000.128extra"
+# add_100extra(vocab, out_dir)
+#
+
+
+vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
+out_dir = "conv"
+add_100extra(vocab, out_dir)
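As a quick sanity check (not part of the commit), the model written to conv/spiece.model by add_100extra can be inspected with sentencepiece. Whether the 100 sentinel pieces appear directly in the piece list depends on the installed seqio version, since some versions bake extra_ids into vocab.sp_model while others only handle them by offset.

# Minimal sanity-check sketch for the model written by add_100extra above
# (assumes the script has been run and conv/spiece.model exists).
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.load("conv/spiece.model")

print("total pieces:", sp.get_piece_size())
# If the seqio version used baked the extra_ids into sp_model, the last pieces
# are the sentinels, i.e. pieces resembling "<extra_id_99>" ... "<extra_id_0>".
print("last piece:", sp.id_to_piece(sp.get_piece_size() - 1))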