crumb committed on
Commit
a2aeb80
1 Parent(s): 8789d56

Upload model

Browse files
Files changed (3) hide show
  1. config.json +2 -2
  2. config_gzipembed.py +2 -2
  3. modeling_gzipembed.py +6 -6
config.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26df1c5bae0d1013f5211eb1ce268622d68d8dfccdd186759695ad355dd2c473
3
- size 402241252
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eafe0625df8078f9a08d9469a8e67d2ed836b4d82bf1123f9fa469764ca60b2
3
+ size 200272042
config_gzipembed.py CHANGED
@@ -20,8 +20,8 @@ class GZIPEmbeddingConfig(PretrainedConfig):
20
  ):
21
  self.corpus = corpus
22
  self.normalize = normalize
23
- self.normalized_corpus = normalized_corpus
24
- self.reduction = reduction
25
  self.reduced_dimension = reduced_dimension,
26
  self.remove_stop_words = remove_stop_words
27
  self.stop_words = stop_words
 
20
  ):
21
  self.corpus = corpus
22
  self.normalize = normalize
23
+ self.normalized_corpus = normalized_corpus
24
+ self.reduction = reduction
25
  self.reduced_dimension = reduced_dimension,
26
  self.remove_stop_words = remove_stop_words
27
  self.stop_words = stop_words
modeling_gzipembed.py CHANGED
@@ -21,7 +21,7 @@ class GZIPEmbeddingModel(PreTrainedModel):
21
  global p
22
  def calculate_ncd_row(data_row):
23
  i = data_row[0]
24
- row = self.ncd(data_row[1], prompt)
25
  return i, row
26
  if type(prompt) == str:
27
  prompt = [prompt]
@@ -52,18 +52,18 @@ class GZIPEmbeddingModel(PreTrainedModel):
52
  x_c = len(gzip.compress(_x.encode()))
53
  y_c = len(gzip.compress(_y.encode()))
54
  xy_c = len(gzip.compress(f"{_x} {_y}".encode()))
55
- return (xy_c-min(x_c,y_c))/max(x_c,y_c)
56
-
57
  def gzip_embed(
58
  self,
59
- corpus,
60
- document,
61
  verbose=False,
62
  ):
63
  embedding = []
64
  for reference_document in (corpus if not verbose else tqdm(corpus)):
65
  embedding.append(self.ncd(reference_document, document))
66
  return embedding
67
-
68
  def dimensionality(self):
69
  return len(self.config.corpus)
 
21
  global p
22
  def calculate_ncd_row(data_row):
23
  i = data_row[0]
24
+ row = self.ncd(data_row[1], p)
25
  return i, row
26
  if type(prompt) == str:
27
  prompt = [prompt]
 
52
  x_c = len(gzip.compress(_x.encode()))
53
  y_c = len(gzip.compress(_y.encode()))
54
  xy_c = len(gzip.compress(f"{_x} {_y}".encode()))
55
+ return (xy_c-min(x_c,y_c))/max(x_c,y_c)
56
+
57
  def gzip_embed(
58
  self,
59
+ corpus,
60
+ document,
61
  verbose=False,
62
  ):
63
  embedding = []
64
  for reference_document in (corpus if not verbose else tqdm(corpus)):
65
  embedding.append(self.ncd(reference_document, document))
66
  return embedding
67
+
68
  def dimensionality(self):
69
  return len(self.config.corpus)