Upload model
- .gitattributes +1 -0
- config.json +3 -0
- config_gzipembed.py +28 -0
- model.safetensors +3 -0
- modeling_gzipembed.py +53 -0
.gitattributes CHANGED

```diff
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+config.json filter=lfs diff=lfs merge=lfs -text
```
config.json ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26df1c5bae0d1013f5211eb1ce268622d68d8dfccdd186759695ad355dd2c473
+size 402241252
```
config_gzipembed.py ADDED

```diff
@@ -0,0 +1,28 @@
+from transformers import PretrainedConfig
+from nltk.corpus import stopwords
+from typing import List
+import nltk
+nltk.download('stopwords')
+nltk.download('punkt')
+
+class GZIPEmbeddingConfig(PretrainedConfig):
+    model_type = "gzipembed"
+    def __init__(
+        self,
+        normalize = True,
+        normalized_corpus = True,
+        reduction = False,
+        reduced_dimension = 0,
+        remove_stop_words = True,
+        stop_words = stopwords.words('english'),
+        corpus = [],
+        **kwargs,
+    ):
+        self.corpus = corpus
+        self.normalize = normalize
+        self.normalized_corpus = normalized_corpus
+        self.reduction = reduction
+        self.reduced_dimension = reduced_dimension
+        self.remove_stop_words = remove_stop_words
+        self.stop_words = stop_words
+        super().__init__(**kwargs)
```
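For reference, a minimal sketch of instantiating this config directly. The corpus strings below are hypothetical placeholders; the real corpus ships inside the LFS-tracked config.json above (its ~402 MB object size is presumably the serialized corpus), which is also why config.json needed the .gitattributes change.

```python
# Minimal sketch, assuming config_gzipembed.py is importable from the working
# directory. The corpus strings are hypothetical, not the ones in config.json.
from config_gzipembed import GZIPEmbeddingConfig

config = GZIPEmbeddingConfig(
    corpus=["the cat sat on the mat", "compression is prediction"],  # hypothetical
    normalize=True,            # normalize incoming text before compressing
    normalized_corpus=False,   # this corpus is raw, so normalize it too
    reduction=False,           # no learned linear projection over the NCD vector
)
print(config.model_type)  # gzipembed
```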
model.safetensors ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:753257f36aa4df7a8fa009cef5293a24b945e66727c9831cafedcf24bfd077ae
+size 116
```
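A note on size: the pointer above records an object of just 116 bytes, which squares with the model below exposing only a one-element dummy_parameter; the corpus in config.json presumably does all the work. A hedged sketch of verifying that, assuming the LFS object has actually been fetched:

```python
# Minimal sketch: inspect the fetched safetensors file. The only tensor we
# expect to find is the one-element dummy_parameter declared in
# modeling_gzipembed.py below.
from safetensors.torch import load_file

state = load_file("model.safetensors")
print(state)  # expected: {'dummy_parameter': tensor([1.])}
```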
modeling_gzipembed.py ADDED

```diff
@@ -0,0 +1,53 @@
+from transformers import PreTrainedModel
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from .config_gzipembed import *
+from tqdm.auto import tqdm
+import torch
+import gzip
+
+class GZIPEmbeddingModel(PreTrainedModel):
+    config_class = GZIPEmbeddingConfig
+    def __init__(self, config):
+        super().__init__(config)
+        if config.reduction:
+            self.reduction_head = torch.nn.Linear(len(config.corpus), config.reduced_dimension)
+        else:
+            self.reduction_head = None
+        self.dummy_parameter = torch.nn.Parameter(torch.ones(1))
+
+    def forward(self, texts, verbose=False, return_tensor=False):
+        x = [self.gzip_embed(self.config.corpus, text, verbose=verbose) for text in texts]
+        if self.reduction_head is not None:
+            x = torch.tensor(x)
+            x = x.to(self.reduction_head.weight.dtype).to(self.reduction_head.weight.device)
+            return self.reduction_head(x)
+        return x if not return_tensor else torch.tensor(x)
+
+    def normalize(self, x):
+        x = ''.join([char for char in x.lower() if char in "abcdefghijklmnopqrstuvwxyz "])
+        x = word_tokenize(x)
+        x = [w for w in x if w not in self.config.stop_words]
+        return ' '.join(x)
+
+    def ncd(self, x, y):
+        _x = self.normalize(x) if (not self.config.normalized_corpus) and self.config.normalize else x
+        _y = self.normalize(y) if self.config.normalize else y
+        x_c = len(gzip.compress(_x.encode()))
+        y_c = len(gzip.compress(_y.encode()))
+        xy_c = len(gzip.compress(f"{_x} {_y}".encode()))
+        return (xy_c - min(x_c, y_c)) / max(x_c, y_c)
+
+    def gzip_embed(
+        self,
+        corpus,
+        document,
+        verbose=False,
+    ):
+        embedding = []
+        for reference_document in (corpus if not verbose else tqdm(corpus)):
+            embedding.append(self.ncd(reference_document, document))
+        return embedding
+
+    def dimensionality(self):
+        return len(self.config.corpus)
```
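Putting the two files together, a hedged end-to-end sketch. The hub repo id is not part of this commit, so the model is constructed directly rather than via AutoModel.from_pretrained with trust_remote_code, and the corpus is again a placeholder:

```python
# Minimal end-to-end sketch, assuming both .py files sit alongside this script
# (the relative import in modeling_gzipembed.py may need to become absolute
# outside a package). Corpus strings are hypothetical.
from config_gzipembed import GZIPEmbeddingConfig
from modeling_gzipembed import GZIPEmbeddingModel

corpus = ["the cat sat on the mat", "dogs chase cats"]  # hypothetical references
config = GZIPEmbeddingConfig(corpus=corpus, normalized_corpus=False)
model = GZIPEmbeddingModel(config)

# Each output dimension is one normalized compression distance against a
# corpus document:
#   NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y)),  C = len(gzip(.))
emb = model(["a cat sat on a mat"], return_tensor=True)
print(emb.shape)               # torch.Size([1, 2]): one NCD per corpus document
print(model.dimensionality())  # 2
```

Since the embedding dimension equals the corpus length, the optional reduction head exists to project the raw NCD vector down to a fixed reduced_dimension; that linear layer is the only trainable part of the model.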