import json import os import numpy as np class VectorCache: """ Caches vectors on disk so one can later build an index on them (indexes like IVF requires big amount of vetores for building) """ def __init__(self, filename='vector_cache.memmap', d=768, size=7000000): self.filename = filename self.offset_file = filename + '.offset' self.d = d self.size = size if os.path.isfile(filename): mode = 'r+' self.f = open(self.offset_file, mode) data = json.load(self.f) self.offset = data[0] self.length = data[1] else: mode = 'w+' self.f = open(self.offset_file, mode) self.offset = 0 self.length = 0 self.db = np.memmap(filename, dtype=np.float32, mode='w+', shape=(size, d), order='C') def sync_offset(self): self.f.seek(0) self.f.truncate(0) self.f.write(json.dumps([self.offset, self.length])) def close(self): self.db.flush() self.db.close() self.sync_offset() self.f.flush() self.f.close() def add(self, vs): l = len(vs) to_end = self.size - self.offset if to_end < l: self.add(vs[:to_end]) self.add(vs[to_end:]) return self.db[self.offset:self.offset+l+1, :] = vs self.offset = (self.offset + l + 1) % self.size self.length = min(self.length + l, self.size)