Spaces:

MarkdenOuden
/

Ancient_Greek_Word2Vec

Runtime error

File size: 2,112 Bytes

from gensim.models import Word2Vec
from collections import defaultdict

def load_word2vec_model(model_path):
    '''
        Read a previously saved Word2Vec model from disk.

        model_path: location of the saved model; handed unchanged to
        Word2Vec.load.

        Returns the object produced by Word2Vec.load. Any error raised
        while loading propagates to the caller.
    '''
    loaded_model = Word2Vec.load(model_path)
    return loaded_model


def get_word_vector(model, word):
    '''
        Look up the vector stored for a single word.

        model: object whose ``wv`` attribute supports ``wv[word]``.
        word: the key to look up.

        Returns whatever ``model.wv[word]`` yields; lookup failures are
        not caught here and propagate to the caller.
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]


def iterate_over_words(model):
    '''
        Print every vocabulary word together with its index and vector.

        model: object exposing ``model.wv.key_to_index`` (a mapping of
        word -> index) and ``model.wv[word]`` (vector lookup).

        One line is printed per word, in the iteration order of
        ``key_to_index``. Nothing is returned.
    '''
    keyed_vectors = model.wv
    # The printed index comes straight from key_to_index, so no separate
    # hand-maintained counter is needed.
    for word, index in keyed_vectors.key_to_index.items():
        vector = keyed_vectors[word]
        print(f'{index} Word: {word}, Vector: {vector}')


def model_dictionary(model):
    '''
        Build a word -> vector mapping covering the whole vocabulary.

        model: object exposing ``model.wv.key_to_index`` (iterated for
        its words) and ``model.wv[word]`` (vector lookup).

        Returns a ``defaultdict(list)`` keyed by word, in the iteration
        order of ``key_to_index``. Because it is a defaultdict, looking
        up a word that is not in the vocabulary yields (and stores) an
        empty list instead of raising KeyError.
    '''
    keyed_vectors = model.wv
    # Named word_vectors rather than ``dict`` so the builtin is not shadowed.
    word_vectors = defaultdict(list)
    # Only the words are needed; the index values of key_to_index are unused.
    for word in keyed_vectors.key_to_index:
        word_vectors[word] = keyed_vectors[word]

    return word_vectors
    
    
def dot_product(vector_a, vector_b):
    '''
        Compute the dot product of two vectors.

        Components are paired positionally with zip, so any trailing
        components of the longer vector are ignored. Two empty vectors
        give 0.
    '''
    pairwise_products = [x * y for x, y in zip(vector_a, vector_b)]
    return sum(pairwise_products)


def magnitude(vector):
    '''
        Compute the Euclidean length of a vector.

        The squares of all components are summed and the square root of
        that sum is returned. An empty vector gives 0.0.
    '''
    squared_components = [component ** 2 for component in vector]
    sum_of_squares = sum(squared_components)
    return sum_of_squares ** 0.5


def cosine_similarity(vector_a, vector_b):
    '''
        Compute the cosine similarity between two vectors.

        The result is dot_product(a, b) divided by the product of the
        two magnitudes. If either magnitude is zero the function returns
        0.0 rather than dividing by zero.
    '''
    numerator = dot_product(vector_a, vector_b)
    norm_a = magnitude(vector_a)
    norm_b = magnitude(vector_b)

    # Divide only when both norms are non-zero.
    if norm_a != 0 and norm_b != 0:
        return numerator / (norm_a * norm_b)

    return 0.0
    

def main():
    '''
        Load the archaic CBOW model and print the cosine similarity
        between the vectors of two fixed words.
    '''
    word2vec = load_word2vec_model('../models/archaic_cbow.model')
    vectors_by_word = model_dictionary(word2vec)

    first_vector = vectors_by_word['Πελοπόννησος']
    second_vector = vectors_by_word['σπάργανον']
    print(cosine_similarity(first_vector, second_vector))

    # Other helpers available for manual inspection:
    #   get_word_vector(word2vec, 'ἀνήρ')  -> vector of a single word
    #   iterate_over_words(word2vec)       -> print every word and vector


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()