from collections import defaultdict

from gensim.models import Word2Vec


def load_word2vec_model(model_path):
    '''
    Load a word2vec model from a file.

    model_path: path of a model file, passed straight to Word2Vec.load.
    Returns whatever Word2Vec.load returns for that path.
    '''
    return Word2Vec.load(model_path)


def get_word_vector(model, word):
    '''
    Return the vector stored for a word in the model.

    The lookup is model.wv[word]; no handling is done here for words that
    are not in the vocabulary (presumably gensim raises KeyError for them --
    confirm against the gensim version in use).
    '''
    return model.wv[word]


def iterate_over_words(model):
    '''
    Print every word in the vocabulary together with its vector.

    The number printed in front of each word is the value stored for that
    word in model.wv.key_to_index.
    '''
    # The loop variable comes from key_to_index on every iteration, so no
    # separate manual counter is needed.
    for word, index in model.wv.key_to_index.items():
        vector = get_word_vector(model, word)
        print(f'{index} Word: {word}, Vector: {vector}')


def model_dictionary(model):
    '''
    Return the vocabulary of the word2vec model as a mapping.

    Key is the word and value is the vector of the word. The mapping is a
    defaultdict(list): looking up a word that is not in the vocabulary
    returns an empty list (and stores it under that key) instead of raising
    KeyError.
    '''
    word_vectors = defaultdict(list)
    for word in model.wv.key_to_index:
        word_vectors[word] = get_word_vector(model, word)
    return word_vectors


def dot_product(vector_a, vector_b):
    '''
    Return the dot product of two vectors.

    The vectors are paired with zip, so if their lengths differ the extra
    components of the longer one are ignored.
    '''
    return sum(a * b for a, b in zip(vector_a, vector_b))


def magnitude(vector):
    '''
    Return the magnitude (Euclidean length) of a vector.
    '''
    return sum(x**2 for x in vector) ** 0.5


def cosine_similarity(vector_a, vector_b):
    '''
    Return the cosine similarity of two vectors.

    Returns 0.0 when either vector has zero magnitude (this includes an
    empty vector), so the division below never divides by zero.
    '''
    dot_prod = dot_product(vector_a, vector_b)
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)

    # Avoid division by zero
    if mag_a == 0 or mag_b == 0:
        return 0.0

    similarity = dot_prod / (mag_a * mag_b)
    return similarity


def main():
    '''
    Load the archaic CBOW model and print the cosine similarity of two words.
    '''
    model = load_word2vec_model('../models/archaic_cbow.model')
    archaic_cbow_dict = model_dictionary(model)

    score = cosine_similarity(
        archaic_cbow_dict['Πελοπόννησος'],
        archaic_cbow_dict['σπάργανον'],
    )
    print(score)

    # vector = get_word_vector(model, 'ἀνήρ')
    # print(vector)

    # Iterate over all words and print their vectors
    # iterate_over_words(model)


if __name__ == "__main__":
    main()