Spaces:

MarkdenOuden
/

Ancient_Greek_Word2Vec

Runtime error

App Files Community

Mark7549 committed on Mar 8

Commit

5f3cfd3

•

1 Parent(s): fd2f101

Add models to repo

Browse files

Files changed (7) hide show

models/.gitattributes +1 -0
models/archaic_cbow.model +3 -0
models/classical_cbow.model +3 -0
models/early_roman_cbow.model +3 -0
models/hellen_cbow.model +3 -0
models/late_roman_cbow.model +3 -0
word2vec.py +56 -3

models/.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.model filter=lfs diff=lfs merge=lfs -text

models/archaic_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fdd1887db84078af826ae006bf11f884c808342f1ff9da93fd525052eef08204
+size 1647899

models/classical_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a50d112100a49d901e45e798591d2040c53bc50c67a48da1e05294f207ed5e2e
+size 6263363

models/early_roman_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f63942fae0974f4c3e39552d2d574a2f4b84e125c648d428a038e6192ec6f3f8
+size 8483329

models/hellen_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:027f8bdad4555ad4a4821a65ab2d564275105dda2d02e598e1f5f3435aedd90a
+size 5473215

models/late_roman_cbow.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53d66deaf1b14067cead5da52e46e75d0944c2140a9b36782e85f01f2ac454f4
+size 3696190

word2vec.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from gensim.models import Word2Vec
 def load_word2vec_model(model_path):
     '''
@@ -6,12 +7,14 @@ def load_word2vec_model(model_path):
     '''
     return Word2Vec.load(model_path)
 def get_word_vector(model, word):
     '''
         Return the word vector of a word
     '''
     return model.wv[word]
 def iterate_over_words(model):
     '''
         Iterate over all words in the vocabulary and print their vectors
@@ -22,13 +25,63 @@ def iterate_over_words(model):
         print(f'{index} Word: {word}, Vector: {vector}')
         index += 1
 def main():
     model = load_word2vec_model('../models/archaic_cbow.model')
-    vector = get_word_vector(model, 'ἀνήρ')
-    print(vector)
     # Iterate over all words and print their vectors
-    iterate_over_words(model)
 if __name__ == "__main__":
     main()

 from gensim.models import Word2Vec
+from collections import defaultdict
 def load_word2vec_model(model_path):
     '''
     '''
     return Word2Vec.load(model_path)
 def get_word_vector(model, word):
     '''
         Return the word vector of a word
     '''
     return model.wv[word]
 def iterate_over_words(model):
     '''
         Iterate over all words in the vocabulary and print their vectors
         print(f'{index} Word: {word}, Vector: {vector}')
         index += 1
+def model_dictionary(model):
+    '''
+        Return the dictionary of the word2vec model
+        Key is the word and value is the vector of the word
+    '''
+    dict = defaultdict(list)
+    for word, index in model.wv.key_to_index.items():
+        vector = get_word_vector(model, word)
+        dict[word] = vector
+    return dict
+def dot_product(vector_a, vector_b):
+    '''
+        Return the dot product of two vectors
+    '''
+    return sum(a * b for a, b in zip(vector_a, vector_b))
+def magnitude(vector):
+    '''
+        Return the magnitude of a vector
+    '''
+    return sum(x**2 for x in vector) ** 0.5
+def cosine_similarity(vector_a, vector_b):
+    '''
+        Return the cosine similarity of two vectors
+    '''
+    dot_prod = dot_product(vector_a, vector_b)
+    mag_a = magnitude(vector_a)
+    mag_b = magnitude(vector_b)
+    # Avoid division by zero
+    if mag_a == 0 or mag_b == 0:
+        return 0.0
+    similarity = dot_prod / (mag_a * mag_b)
+    return similarity
 def main():
     model = load_word2vec_model('../models/archaic_cbow.model')
+    archaic_cbow_dict = model_dictionary(model)
+    score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
+    print(score)
+    # vector = get_word_vector(model, 'ἀνήρ')
+    # print(vector)
     # Iterate over all words and print their vectors
+    # iterate_over_words(model)
 if __name__ == "__main__":
     main()