Upload model
Browse files
- config.json +2 -2
- config_gzipembed.py +2 -2
- modeling_gzipembed.py +6 -6
config.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4eafe0625df8078f9a08d9469a8e67d2ed836b4d82bf1123f9fa469764ca60b2
|
3 |
+
size 200272042
|
config_gzipembed.py
CHANGED
@@ -20,8 +20,8 @@ class GZIPEmbeddingConfig(PretrainedConfig):
|
|
20 |
):
|
21 |
self.corpus = corpus
|
22 |
self.normalize = normalize
|
23 |
-
self.normalized_corpus = normalized_corpus
|
24 |
-
self.reduction = reduction
|
25 |
self.reduced_dimension = reduced_dimension,
|
26 |
self.remove_stop_words = remove_stop_words
|
27 |
self.stop_words = stop_words
|
|
|
20 |
):
|
21 |
self.corpus = corpus
|
22 |
self.normalize = normalize
|
23 |
+
self.normalized_corpus = normalized_corpus
|
24 |
+
self.reduction = reduction
|
25 |
self.reduced_dimension = reduced_dimension,
|
26 |
self.remove_stop_words = remove_stop_words
|
27 |
self.stop_words = stop_words
|
modeling_gzipembed.py
CHANGED
@@ -21,7 +21,7 @@ class GZIPEmbeddingModel(PreTrainedModel):
|
|
21 |
global p
|
22 |
def calculate_ncd_row(data_row):
|
23 |
i = data_row[0]
|
24 |
-
row = self.ncd(data_row[1],
|
25 |
return i, row
|
26 |
if type(prompt) == str:
|
27 |
prompt = [prompt]
|
@@ -52,18 +52,18 @@ class GZIPEmbeddingModel(PreTrainedModel):
|
|
52 |
x_c = len(gzip.compress(_x.encode()))
|
53 |
y_c = len(gzip.compress(_y.encode()))
|
54 |
xy_c = len(gzip.compress(f"{_x} {_y}".encode()))
|
55 |
-
return (xy_c-min(x_c,y_c))/max(x_c,y_c)
|
56 |
-
|
57 |
def gzip_embed(
|
58 |
self,
|
59 |
-
corpus,
|
60 |
-
document,
|
61 |
verbose=False,
|
62 |
):
|
63 |
embedding = []
|
64 |
for reference_document in (corpus if not verbose else tqdm(corpus)):
|
65 |
embedding.append(self.ncd(reference_document, document))
|
66 |
return embedding
|
67 |
-
|
68 |
def dimensionality(self):
|
69 |
return len(self.config.corpus)
|
|
|
21 |
global p
|
22 |
def calculate_ncd_row(data_row):
|
23 |
i = data_row[0]
|
24 |
+
row = self.ncd(data_row[1], p)
|
25 |
return i, row
|
26 |
if type(prompt) == str:
|
27 |
prompt = [prompt]
|
|
|
52 |
x_c = len(gzip.compress(_x.encode()))
|
53 |
y_c = len(gzip.compress(_y.encode()))
|
54 |
xy_c = len(gzip.compress(f"{_x} {_y}".encode()))
|
55 |
+
return (xy_c-min(x_c,y_c))/max(x_c,y_c)
|
56 |
+
|
57 |
def gzip_embed(
|
58 |
self,
|
59 |
+
corpus,
|
60 |
+
document,
|
61 |
verbose=False,
|
62 |
):
|
63 |
embedding = []
|
64 |
for reference_document in (corpus if not verbose else tqdm(corpus)):
|
65 |
embedding.append(self.ncd(reference_document, document))
|
66 |
return embedding
|
67 |
+
|
68 |
def dimensionality(self):
|
69 |
return len(self.config.corpus)
|