Mark7549 committed on
Commit
5f3cfd3
1 Parent(s): fd2f101

Add models to repo

Browse files
models/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.model filter=lfs diff=lfs merge=lfs -text
models/archaic_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdd1887db84078af826ae006bf11f884c808342f1ff9da93fd525052eef08204
3
+ size 1647899
models/classical_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a50d112100a49d901e45e798591d2040c53bc50c67a48da1e05294f207ed5e2e
3
+ size 6263363
models/early_roman_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f63942fae0974f4c3e39552d2d574a2f4b84e125c648d428a038e6192ec6f3f8
3
+ size 8483329
models/hellen_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:027f8bdad4555ad4a4821a65ab2d564275105dda2d02e598e1f5f3435aedd90a
3
+ size 5473215
models/late_roman_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53d66deaf1b14067cead5da52e46e75d0944c2140a9b36782e85f01f2ac454f4
3
+ size 3696190
word2vec.py CHANGED
@@ -1,4 +1,5 @@
1
  from gensim.models import Word2Vec
 
2
 
3
  def load_word2vec_model(model_path):
4
  '''
@@ -6,12 +7,14 @@ def load_word2vec_model(model_path):
6
  '''
7
  return Word2Vec.load(model_path)
8
 
 
9
  def get_word_vector(model, word):
10
  '''
11
  Return the word vector of a word
12
  '''
13
  return model.wv[word]
14
 
 
15
  def iterate_over_words(model):
16
  '''
17
  Iterate over all words in the vocabulary and print their vectors
@@ -22,13 +25,63 @@ def iterate_over_words(model):
22
  print(f'{index} Word: {word}, Vector: {vector}')
23
  index += 1
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def main():
26
  model = load_word2vec_model('../models/archaic_cbow.model')
27
- vector = get_word_vector(model, 'ἀνήρ')
28
- print(vector)
 
 
 
 
 
29
 
30
  # Iterate over all words and print their vectors
31
- iterate_over_words(model)
 
32
 
33
  if __name__ == "__main__":
34
  main()
 
1
  from gensim.models import Word2Vec
2
+ from collections import defaultdict
3
 
4
  def load_word2vec_model(model_path):
5
  '''
 
7
  '''
8
  return Word2Vec.load(model_path)
9
 
10
+
11
  def get_word_vector(model, word):
12
  '''
13
  Return the word vector of a word
14
  '''
15
  return model.wv[word]
16
 
17
+
18
  def iterate_over_words(model):
19
  '''
20
  Iterate over all words in the vocabulary and print their vectors
 
25
  print(f'{index} Word: {word}, Vector: {vector}')
26
  index += 1
27
 
28
+
29
def model_dictionary(model):
    '''
    Return the dictionary of the word2vec model
    Key is the word and value is the vector of the word

    The result is a defaultdict(list): looking up a word that is not in
    the vocabulary yields (and stores) an empty list instead of raising
    KeyError.
    '''
    word_vectors = model.wv
    # Only the vocabulary keys are needed; the index values are unused.
    return defaultdict(
        list,
        ((word, word_vectors[word]) for word in word_vectors.key_to_index),
    )
40
+
41
+
42
def dot_product(vector_a, vector_b):
    '''
    Multiply the two vectors component by component and return the total

    Components are paired with zip, so any extra trailing components of
    the longer vector are ignored.
    '''
    pairwise_products = (
        left * right for left, right in zip(vector_a, vector_b)
    )
    return sum(pairwise_products)
47
+
48
+
49
def magnitude(vector):
    '''
    Return the Euclidean length of a vector
    (the square root of the sum of its squared components)
    '''
    sum_of_squares = sum(component ** 2 for component in vector)
    return sum_of_squares ** 0.5
54
+
55
+
56
def cosine_similarity(vector_a, vector_b):
    '''
    Return the cosine similarity of two vectors

    Returns 0.0 when either vector has zero magnitude (this includes an
    empty vector). The result is clamped to [-1.0, 1.0] because
    floating-point rounding can push it marginally outside that range
    (e.g. 1.0000000000000002 for two parallel vectors).
    '''
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)

    # Avoid division by zero
    if mag_a == 0 or mag_b == 0:
        return 0.0

    similarity = dot_product(vector_a, vector_b) / (mag_a * mag_b)

    # Clamp with explicit comparisons (not min/max) so that a NaN result
    # is passed through unchanged instead of being turned into a bound.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
70
+
71
+
72
  def main():
73
  model = load_word2vec_model('../models/archaic_cbow.model')
74
+ archaic_cbow_dict = model_dictionary(model)
75
+
76
+ score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
77
+ print(score)
78
+
79
+ # vector = get_word_vector(model, 'ἀνήρ')
80
+ # print(vector)
81
 
82
  # Iterate over all words and print their vectors
83
+ # iterate_over_words(model)
84
+
85
 
86
  if __name__ == "__main__":
87
  main()