{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pip -q install sentencepiece\n",
"# pip -q install numpy\n",
"# pip -q install sentence_transformers\n",
"# pip -q install datasets\n",
"import sentencepiece as spm\n",
"import numpy as np\n",
"from datasets import load_dataset\n",
"from collections import Counter\n",
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"model = SentenceTransformer('all-MiniLM-L6-v2')"
]
},
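{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check (not part of the original run): all-MiniLM-L6-v2\n",
"# emits 384-dimensional embeddings, which is where the 384-bit hash and\n",
"# the '384_bit_comp' model name below come from.\n",
"print(model.get_sentence_embedding_dimension())"
]
},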
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\"My favourite food is anything I didn't have to cook myself.\", 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead', 'WHY THE FUCK IS BAYLESS ISOING', 'To make her feel threatened', 'Dirty Southern Wankers', \"OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe PlAyOfFs! Dumbass Broncos fans circa December 2015.\", 'Yes I heard abt the f bombs! That has to be why. Thanks for your reply:) until then hubby and I will anxiously wait 😝', 'We need more boards and to create a bit more space for [NAME]. Then we’ll be good.', 'Damn youtube and outrage drama is super lucrative for reddit', 'It might be linked to the trust factor of your friend.']\n"
]
}
],
"source": [
"dataset = load_dataset(\"go_emotions\")\n",
"texts = dataset[\"train\"][\"text\"]\n",
"print(texts[:10])"
]
},
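{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: the bit-strings derived from these texts will form the\n",
"# SentencePiece training corpus, one 384-bit line per comment.\n",
"print(len(texts), 'training texts')"
]
},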
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def binarize(embeddings, sensitivity=0.1):\n",
"\treturn np.where(embeddings >= sensitivity, 1, 0)\n",
"\n",
"def preprocess(strings):\n",
"\treturn \"\\n\".join([\"\".join(map(str, s)) for s in processed_string])\n",
"\n",
"# Obtain sentence embeddings\n",
"embeddings = model.encode(texts)\n",
"binary_hashes = binarize(embeddings)\n",
"binary_string = preprocess(binary_hashes)\n",
"print(binary_string[:500])"
]
},
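{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (illustrative, not from the original notebook): the\n",
"# binary hashes should roughly preserve semantic similarity. Compare\n",
"# cosine similarity of two float embeddings with the fraction of\n",
"# matching bits in their hashes. Assumes `embeddings` and\n",
"# `binary_hashes` from the previous cell.\n",
"def hamming_similarity(a, b):\n",
"\t# Fraction of bit positions where the two hashes agree\n",
"\treturn float(np.mean(a == b))\n",
"\n",
"i, j = 0, 1\n",
"cos = np.dot(embeddings[i], embeddings[j]) / (\n",
"\tnp.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[j]))\n",
"print('cosine similarity:', round(float(cos), 3))\n",
"print('hamming similarity:', round(hamming_similarity(binary_hashes[i], binary_hashes[j]), 3))"
]
},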
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save passage to a temporary file\n",
"with open(\"passage.txt\", \"w\") as f:\n",
"\tf.write(binary_string)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Training options documentation: https://github.com/google/sentencepiece/blob/master/doc/options.md\n",
"# Training takes 3 hours to complete on GTX 1650 mobile\n",
"spm.SentencePieceTrainer.train(\n",
"\tinput='passage.txt',\n",
"\tmodel_prefix='384_bit_comp',\n",
"\tvocab_size=256 + 3, # To exclude <unk>, </s>, <s>\n",
"\tcharacter_coverage=1.00,\n",
"\tmax_sentencepiece_length=384,\n",
"\tmodel_type='unigram',\n",
")"
]
},
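{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick look at what the trainer learned (illustrative). Each piece is\n",
"# a frequent run of bits; longer pieces mean better compression.\n",
"# Assumes '384_bit_comp.model' was produced by the cell above.\n",
"sp = spm.SentencePieceProcessor(model_file='384_bit_comp.model')\n",
"pieces = [sp.id_to_piece(i) for i in range(sp.get_piece_size())]\n",
"pieces.sort(key=len, reverse=True)\n",
"print('vocab size:', sp.get_piece_size())\n",
"for p in pieces[:5]:\n",
"\tprint(len(p), p)"
]
},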
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"length: 13\n",
"encoded_tokens: ['▁0000000', '0000000000000001000000000000000000000', '00000000001000100', '1000000', '00000000000000000000000000000001000000000000000000000000000000000000000000000000000000', '00000000000000000001000000000000000000000000000000000', '0000000000000000000000000000000001000', '00000000000000000000000100000000000000000', '00000000010', '0000000000000000000000000000000000000100', '00000000000100000000000000000', '00000000010', '00001000']\n",
"encoded_ids: 1ab2ed09d7a9617206894e0608\n",
"same?: True\n",
"count: Counter({'00000000010': 2, '▁0000000': 1, '0000000000000001000000000000000000000': 1, '00000000001000100': 1, '1000000': 1, '00000000000000000000000000000001000000000000000000000000000000000000000000000000000000': 1, '00000000000000000001000000000000000000000000000000000': 1, '0000000000000000000000000000000001000': 1, '00000000000000000000000100000000000000000': 1, '0000000000000000000000000000000000000100': 1, '00000000000100000000000000000': 1, '00001000': 1})\n"
]
}
],
"source": [
"bpe_processor = spm.SentencePieceProcessor(model_file='384_bit_comp.model')\n",
"\n",
"def encode_id(bit_text):\n",
"\tencoded_pieces = bpe_processor.encode_as_pieces(bit_text)\n",
"\tencoded_ids = [bpe_processor.piece_to_id(s) - 3 for s in encoded_pieces]\n",
"\tassert any([id_ <= 255 for id_ in encoded_ids])\n",
"\tstring_ids = \"\".join([format(id_, \"02x\") for id_ in encoded_ids])\n",
"\treturn string_ids\n",
"\n",
"def decode_id(hex_string):\n",
"\tu8_array = np.frombuffer(bytes.fromhex(hex_string), dtype='<u1') + 3\n",
"\tencoded_tokens = [bpe_processor.id_to_piece(int(id_)) for id_ in u8_array]\n",
"\treturn encoded_tokens\n",
"\n",
"# Encode text\n",
"new_sentence = \"000000000000000000000010000000000000000000000000000000100010010000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000100000000000000000000000000100000000000000000000000000000000000000100000000000001000000000000000000000000001000001000\"\n",
"encoded_tokens = bpe_processor.encode_as_pieces(new_sentence)\n",
"encoded_ids = encode_id(new_sentence)\n",
"decoded_tokens = decode_id(encoded_ids)\n",
"\n",
"print(\"length:\", len(encoded_tokens))\n",
"print(\"encoded_tokens:\", encoded_tokens)\n",
"print(\"encoded_ids:\", encoded_ids)\n",
"print(\"same?:\", encoded_tokens == decoded_tokens)\n",
"\n",
"count = Counter(encoded_tokens)\n",
"print(\"count:\", count)"
]
}
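,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# End-to-end check (illustrative): the hex string should decode back to\n",
"# the exact 384-bit input, so the hash survives the compression\n",
"# roundtrip. SentencePiece marks the start of text with '▁', which we\n",
"# strip before comparing. Assumes the previous cell has run.\n",
"roundtrip = ''.join(decoded_tokens).replace('▁', '')\n",
"print('roundtrip matches input:', roundtrip == new_sentence)\n",
"print('input bits:', len(new_sentence))\n",
"print('compressed bits:', 4 * len(encoded_ids))  # 4 bits per hex digit\n",
"print('compression ratio:', round(len(new_sentence) / (4 * len(encoded_ids)), 2))"
]
}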
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}