File size: 1,540 Bytes
6bc49a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import json
import os
import numpy as np
class VectorCache:
    """Fixed-capacity, disk-backed ring buffer of float32 vectors.

    Vectors are cached on disk so that an index can be built on them later
    (indexes such as IVF need a large number of vectors for training).

    Storage is a ``numpy.memmap`` of shape ``(size, d)`` stored in
    ``filename``. The write position and the number of stored vectors are
    persisted as a JSON list ``[offset, length]`` in ``filename + '.offset'``.
    Once ``size`` vectors have been written, new vectors overwrite the
    oldest rows.
    """

    def __init__(self, filename='vector_cache.memmap', d=768, size=7000000):
        """Open an existing cache or create a new one.

        Args:
            filename: Path of the memmap file holding the vectors.
            d: Dimensionality of each vector (number of columns).
            size: Capacity of the cache in vectors (number of rows).
        """
        self.filename = filename
        self.offset_file = filename + '.offset'
        self.d = d
        self.size = size
        self.offset = 0
        self.length = 0

        existing = os.path.isfile(filename)
        # 'w+' creates/overwrites the file, so it must only be used for a
        # brand-new cache; an existing cache is opened with 'r+' to keep
        # its contents.
        self.db = np.memmap(filename, dtype=np.float32,
                            mode='r+' if existing else 'w+',
                            shape=(size, d), order='C')

        if existing and os.path.isfile(self.offset_file):
            self.f = open(self.offset_file, 'r+')
            try:
                saved = json.load(self.f)
                saved_offset, saved_length = int(saved[0]), int(saved[1])
            except (ValueError, TypeError, IndexError, KeyError):
                # The sidecar is empty or malformed (e.g. the previous run
                # ended before close()); start from an empty position
                # instead of failing to open the cache.
                pass
            else:
                self.offset = saved_offset
                self.length = saved_length
        else:
            # New cache, or sidecar missing: start a fresh offset file.
            self.f = open(self.offset_file, 'w+')

    def sync_offset(self):
        """Rewrite the offset file with the current ``[offset, length]``."""
        self.f.seek(0)
        self.f.truncate(0)
        self.f.write(json.dumps([self.offset, self.length]))
        self.f.flush()

    def close(self):
        """Flush vectors and bookkeeping to disk and release both files.

        Calling ``close()`` again on an already closed cache is a no-op.
        """
        if self.f.closed:
            return
        if self.db is not None:
            self.db.flush()
        self.sync_offset()
        self.f.close()
        # np.memmap has no close() method; dropping the reference is what
        # releases the underlying mapping.
        self.db = None

    def add(self, vs):
        """Append vectors, wrapping around to row 0 when the end is reached.

        Args:
            vs: Sequence or array of vectors (one per row). A single 1-D
                vector is treated as one row. An empty input is ignored.
        """
        rows = np.asarray(vs)
        if rows.size == 0:
            return
        if rows.ndim == 1:
            rows = rows[np.newaxis, :]

        total = len(rows)
        start = 0
        while start < total:
            # Write at most up to the end of the buffer, then wrap.
            room = self.size - self.offset
            count = min(room, total - start)
            self.db[self.offset:self.offset + count, :] = \
                rows[start:start + count]
            self.offset = (self.offset + count) % self.size
            start += count
        self.length = min(self.length + total, self.size)
|