{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip -q install sentencepiece\n",
    "# pip -q install numpy\n",
    "# pip -q install sentence_transformers\n",
    "# pip -q install datasets\n",
    "import sentencepiece as spm\n",
    "import numpy as np\n",
    "from datasets import load_dataset\n",
    "from collections import Counter\n",
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SentenceTransformer('all-MiniLM-L6-v2')"
   ]
  },
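  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check (illustrative sketch): all-MiniLM-L6-v2 produces\n",
    "# 384-dimensional embeddings, so each binary hash built below is 384 bits\n",
    "# (48 bytes raw before compression).\n",
    "sample_embedding = model.encode([\"hello world\"])\n",
    "print(sample_embedding.shape)  # expected: (1, 384)"
   ]
  },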
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"My favourite food is anything I didn't have to cook myself.\", 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead', 'WHY THE FUCK IS BAYLESS ISOING', 'To make her feel threatened', 'Dirty Southern Wankers', \"OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe PlAyOfFs! Dumbass Broncos fans circa December 2015.\", 'Yes I heard abt the f bombs! That has to be why. Thanks for your reply:) until then hubby and I will anxiously wait 😝', 'We need more boards and to create a bit more space for [NAME]. Then we’ll be good.', 'Damn youtube and outrage drama is super lucrative for reddit', 'It might be linked to the trust factor of your friend.']\n"
     ]
    }
   ],
   "source": [
    "dataset = load_dataset(\"go_emotions\")\n",
    "texts = dataset[\"train\"][\"text\"]\n",
    "print(texts[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def binarize(embeddings, sensitivity=0.1):\n",
    "\treturn np.where(embeddings >= sensitivity, 1, 0)\n",
    "\n",
    "def preprocess(strings):\n",
    "\treturn \"\\n\".join([\"\".join(map(str, s)) for s in processed_string])\n",
    "\n",
    "# Obtain sentence embeddings\n",
    "embeddings = model.encode(texts)\n",
    "binary_hashes = binarize(embeddings)\n",
    "binary_string = preprocess(binary_hashes)\n",
    "print(binary_string[:500])"
   ]
  },
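  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative toy example of the binarization step: values at or above\n",
    "# the sensitivity threshold (0.1) become 1, everything else becomes 0.\n",
    "toy = np.array([[0.25, -0.4, 0.05, 0.7]])\n",
    "print(binarize(toy))              # [[1 0 0 1]]\n",
    "print(preprocess(binarize(toy)))  # \"1001\""
   ]
  },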
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save passage to a temporary file\n",
    "with open(\"passage.txt\", \"w\") as f:\n",
    "\tf.write(binary_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training options documentation: https://github.com/google/sentencepiece/blob/master/doc/options.md\n",
    "# Training takes 3 hours to complete on GTX 1650 mobile\n",
    "spm.SentencePieceTrainer.train(\n",
    "\tinput='passage.txt',\n",
    "\tmodel_prefix='384_bit_comp',\n",
    "\tvocab_size=256 + 3, # To exclude <unk>, </s>, <s>\n",
    "\tcharacter_coverage=1.00,\n",
    "\tmax_sentencepiece_length=384,\n",
    "\tmodel_type='unigram',\n",
    ")"
   ]
  },
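  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional inspection of the trained model (a sketch): the piece size\n",
    "# should match vocab_size above (259 = 256 learned pieces + <unk>, <s>, </s>),\n",
    "# and ids 3..258 are the learned bit-string pieces.\n",
    "sp = spm.SentencePieceProcessor(model_file='384_bit_comp.model')\n",
    "print(sp.get_piece_size())\n",
    "print([sp.id_to_piece(i) for i in range(3, 8)])  # a few learned pieces"
   ]
  },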
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "length: 13\n",
      "encoded_tokens: ['▁0000000', '0000000000000001000000000000000000000', '00000000001000100', '1000000', '00000000000000000000000000000001000000000000000000000000000000000000000000000000000000', '00000000000000000001000000000000000000000000000000000', '0000000000000000000000000000000001000', '00000000000000000000000100000000000000000', '00000000010', '0000000000000000000000000000000000000100', '00000000000100000000000000000', '00000000010', '00001000']\n",
      "encoded_ids: 1ab2ed09d7a9617206894e0608\n",
      "same?: True\n",
      "count: Counter({'00000000010': 2, '▁0000000': 1, '0000000000000001000000000000000000000': 1, '00000000001000100': 1, '1000000': 1, '00000000000000000000000000000001000000000000000000000000000000000000000000000000000000': 1, '00000000000000000001000000000000000000000000000000000': 1, '0000000000000000000000000000000001000': 1, '00000000000000000000000100000000000000000': 1, '0000000000000000000000000000000000000100': 1, '00000000000100000000000000000': 1, '00001000': 1})\n"
     ]
    }
   ],
   "source": [
    "bpe_processor = spm.SentencePieceProcessor(model_file='384_bit_comp.model')\n",
    "\n",
    "def encode_id(bit_text):\n",
    "\tencoded_pieces = bpe_processor.encode_as_pieces(bit_text)\n",
    "\tencoded_ids = [bpe_processor.piece_to_id(s) - 3 for s in encoded_pieces]\n",
    "\tassert any([id_ <= 255 for id_ in encoded_ids])\n",
    "\tstring_ids = \"\".join([format(id_, \"02x\") for id_ in encoded_ids])\n",
    "\treturn string_ids\n",
    "\n",
    "def decode_id(hex_string):\n",
    "\tu8_array = np.frombuffer(bytes.fromhex(hex_string), dtype='<u1') + 3\n",
    "\tencoded_tokens = [bpe_processor.id_to_piece(int(id_)) for id_ in u8_array]\n",
    "\treturn encoded_tokens\n",
    "\n",
    "# Encode text\n",
    "new_sentence = \"000000000000000000000010000000000000000000000000000000100010010000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000100000000000000000000000000100000000000000000000000000000000000000100000000000001000000000000000000000000001000001000\"\n",
    "encoded_tokens = bpe_processor.encode_as_pieces(new_sentence)\n",
    "encoded_ids = encode_id(new_sentence)\n",
    "decoded_tokens = decode_id(encoded_ids)\n",
    "\n",
    "print(\"length:\", len(encoded_tokens))\n",
    "print(\"encoded_tokens:\", encoded_tokens)\n",
    "print(\"encoded_ids:\", encoded_ids)\n",
    "print(\"same?:\", encoded_tokens == decoded_tokens)\n",
    "\n",
    "count = Counter(encoded_tokens)\n",
    "print(\"count:\", count)"
   ]
  }
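  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough compression check (a sketch using the helpers above): a 384-bit\n",
    "# hash is 48 bytes raw, while its hex id string takes len(hex) / 2 bytes.\n",
    "# Joining the decoded pieces (minus the leading \"▁\" marker) should also\n",
    "# reproduce the original bit string.\n",
    "sample_hashes = [\"\".join(map(str, h)) for h in binary_hashes[:5]]\n",
    "for bits in sample_hashes:\n",
    "\thex_ids = encode_id(bits)\n",
    "\trecovered = \"\".join(decode_id(hex_ids)).replace(\"▁\", \"\")\n",
    "\tprint(f\"{len(bits) // 8} -> {len(hex_ids) // 2} bytes, roundtrip ok: {recovered == bits}\")"
   ]
  }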
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}