crumb committed on
Commit ead1d68
1 Parent(s): 7e7e8db

Upload model

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ config.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26df1c5bae0d1013f5211eb1ce268622d68d8dfccdd186759695ad355dd2c473
+ size 402241252
config_gzipembed.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import PretrainedConfig
+ from nltk.corpus import stopwords
+ from typing import List
+ import nltk
+ nltk.download('stopwords')
+ nltk.download('punkt')
+
+ class GZIPEmbeddingConfig(PretrainedConfig):
+     model_type = "gzipembed"
+     def __init__(
+         self,
+         normalize = True,
+         normalized_corpus = True,
+         reduction = False,
+         reduced_dimension = 0,
+         remove_stop_words = True,
+         stop_words = stopwords.words('english'),
+         corpus = [],
+         **kwargs,
+     ):
+         self.corpus = corpus
+         self.normalize = normalize
+         self.normalized_corpus = normalized_corpus
+         self.reduction = reduction
+         self.reduced_dimension = reduced_dimension
+         self.remove_stop_words = remove_stop_words
+         self.stop_words = stop_words
+         super().__init__(**kwargs)
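
For reference, a minimal sketch of how this config might be constructed locally (a hedged example: the tiny two-document corpus here is made up for illustration, not the corpus shipped in the 402 MB config.json above):

# Assumes config_gzipembed.py is importable from the working directory.
from config_gzipembed import GZIPEmbeddingConfig

config = GZIPEmbeddingConfig(
    corpus=["the quick brown fox", "lorem ipsum dolor sit amet"],  # illustrative only
    reduction=False,  # no learned projection; embeddings are raw NCD vectors
)
print(config.model_type)   # "gzipembed"
print(len(config.corpus))  # embedding dimensionality equals corpus size -> 2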
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:753257f36aa4df7a8fa009cef5293a24b945e66727c9831cafedcf24bfd077ae
+ size 116
modeling_gzipembed.py ADDED
@@ -0,0 +1,53 @@
+ from transformers import PreTrainedModel
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from .config_gzipembed import *
+ from tqdm.auto import tqdm
+ import torch
+ import gzip
+
+ class GZIPEmbeddingModel(PreTrainedModel):
+     config_class = GZIPEmbeddingConfig
+     def __init__(self, config):
+         super().__init__(config)
+         if config.reduction:
+             self.reduction_head = torch.nn.Linear(len(config.corpus), config.reduced_dimension)
+         else:
+             self.reduction_head = None
+         self.dummy_parameter = torch.nn.Parameter(torch.ones(1))
+
+     def forward(self, texts, verbose=False, return_tensor=False):
+         x = [self.gzip_embed(self.config.corpus, text, verbose=verbose) for text in texts]
+         if self.reduction_head is not None:
+             x = torch.tensor(x)
+             x = x.to(self.reduction_head.weight.dtype).to(self.reduction_head.weight.device)
+             return self.reduction_head(x)
+         return x if not return_tensor else torch.tensor(x)
+
+     def normalize(self, x):
+         x = ''.join([char for char in x.lower() if char in "abcdefghijklmnopqrstuvwxyz "])
+         x = word_tokenize(x)
+         x = [w for w in x if w not in self.config.stop_words]
+         return ' '.join(x)
+
+     def ncd(self, x, y):
+         _x = self.normalize(x) if self.config.normalize else x
+         _y = self.normalize(y) if (not self.config.normalized_corpus) and self.config.normalize else y
+         x_c = len(gzip.compress(_x.encode()))
+         y_c = len(gzip.compress(_y.encode()))
+         xy_c = len(gzip.compress(f"{_x} {_y}".encode()))
+         return (xy_c - min(x_c, y_c)) / max(x_c, y_c)
+
+     def gzip_embed(
+         self,
+         corpus,
+         document,
+         verbose=False,
+     ):
+         embedding = []
+         for reference_document in (corpus if not verbose else tqdm(corpus)):
+             embedding.append(self.ncd(reference_document, document))
+         return embedding
+
+     def dimensionality(self):
+         return len(self.config.corpus)
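
Taken together, a hedged end-to-end sketch of how the two files combine (assumptions: both modules sit inside a package, here called gzipembed/, since modeling_gzipembed.py uses a relative import; the two-document corpus is again made up for illustration):

from gzipembed.config_gzipembed import GZIPEmbeddingConfig
from gzipembed.modeling_gzipembed import GZIPEmbeddingModel

config = GZIPEmbeddingConfig(
    corpus=["the quick brown fox", "lorem ipsum dolor sit amet"],  # illustrative only
)
model = GZIPEmbeddingModel(config)

# Each input text is scored against every corpus document with the
# normalized compression distance computed in ncd():
#   NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y)),
# where C(.) is the gzip-compressed length in bytes.
emb = model(["a quick brown dog"], return_tensor=True)
print(emb.shape)  # torch.Size([1, 2]) -- one NCD value per corpus document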