import heapq
import os
from collections import defaultdict
from operator import itemgetter

from gensim.models import Word2Vec

# Directory that holds the trained ``*.model`` files (relative to the CWD).
_MODELS_DIR = 'models'

# Time slices for which ``load_all_models`` loads ``<slice>_cbow.model``,
# in the order in which they are returned.
_TIME_SLICES = ('archaic', 'classical', 'early_roman', 'hellen', 'late_roman')


def _model_path(model_name):
    '''
    Return the path of the model file for ``model_name``
    (file name without the ``.model`` extension)
    '''
    return f'{_MODELS_DIR}/{model_name}.model'


def _vector_or_empty(model, word):
    '''
    Return the vector of ``word`` in ``model``, or an empty list when the
    word is not in the model's vocabulary

    An empty vector makes ``cosine_similarity`` return 0.0, which is what a
    lookup in the ``defaultdict(list)`` from ``model_dictionary`` yields for
    an unknown word, without copying the whole vocabulary first.
    '''
    if word in model.wv.key_to_index:
        return get_word_vector(model, word)
    return []


def load_all_models():
    '''
    Load all word2vec models

    Return: list of ``(time_slice_name, model)`` tuples, one per entry of
    ``_TIME_SLICES``, each loaded from ``models/<time_slice_name>_cbow.model``
    '''
    return [
        (time_slice, load_word2vec_model(_model_path(f'{time_slice}_cbow')))
        for time_slice in _TIME_SLICES
    ]


def load_word2vec_model(model_path):
    '''
    Load a word2vec model from a file
    '''
    return Word2Vec.load(model_path)


def get_word_vector(model, word):
    '''
    Return the word vector of a word

    The lookup is ``model.wv[word]``; no fallback is applied here for words
    that are not in the model.
    '''
    return model.wv[word]


def iterate_over_words(model):
    '''
    Iterate over all words in the vocabulary and print their vectors

    The printed number is the index stored for the word in
    ``model.wv.key_to_index``.
    '''
    for word, index in model.wv.key_to_index.items():
        vector = get_word_vector(model, word)
        print(f'{index} Word: {word}, Vector: {vector}')


def model_dictionary(model):
    '''
    Return the dictionary of the word2vec model
    Key is the word and value is the vector of the word

    The result is a ``defaultdict(list)``: looking up a word that is not in
    the model yields (and stores) an empty list instead of raising KeyError.
    '''
    vectors = defaultdict(list)
    for word in model.wv.key_to_index:
        vectors[word] = get_word_vector(model, word)
    return vectors


def dot_product(vector_a, vector_b):
    '''
    Return the dot product of two vectors

    Elements are paired with ``zip``, so if the lengths differ the extra
    elements of the longer vector are ignored.
    '''
    return sum(a * b for a, b in zip(vector_a, vector_b))


def magnitude(vector):
    '''
    Return the magnitude (Euclidean norm) of a vector
    '''
    return sum(x**2 for x in vector) ** 0.5


def cosine_similarity(vector_a, vector_b):
    '''
    Return the cosine similarity of two vectors

    Returns 0.0 when either vector has zero magnitude (this includes empty
    vectors), so the division below can never be by zero.
    '''
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)

    # Avoid division by zero
    if mag_a == 0 or mag_b == 0:
        return 0.0

    return dot_product(vector_a, vector_b) / (mag_a * mag_b)


def get_cosine_similarity(word1, word2, time_slice):
    '''
    Return the cosine similarity of two words within one time slice

    word1, word2: the words to compare
    time_slice: file name (without ``.model``) of the model in ./models

    Return: the cosine similarity, 0.0 if either word is not in the model,
    or None if the model file does not exist
    '''
    path = _model_path(time_slice)
    if not os.path.exists(path):
        return None

    model = load_word2vec_model(path)
    return cosine_similarity(
        _vector_or_empty(model, word1),
        _vector_or_empty(model, word2),
    )


def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
    Return the cosine similarity of one word in two different time slices

    word: the word to compare with itself across time slices
    time_slice1, time_slice2: file names (without ``.model``) in ./models

    Return: the cosine similarity, 0.0 if the word is missing from either
    model, or None if either model file does not exist
    '''
    path1 = _model_path(time_slice1)
    path2 = _model_path(time_slice2)
    # Check both files before loading anything.
    if not (os.path.exists(path1) and os.path.exists(path2)):
        return None

    model1 = load_word2vec_model(path1)
    model2 = load_word2vec_model(path2)
    # NOTE(review): the two vectors come from separately loaded models; the
    # score is only meaningful if their vector spaces are comparable - confirm.
    return cosine_similarity(
        _vector_or_empty(model1, word),
        _vector_or_empty(model2, word),
    )


def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
    '''
    Return the nearest neighbours of a word

    word: the word for which the nearest neighbours are calculated
    time_slice_model: the time slice of the input word, either as the file
                      name (without ``.model``) of a model in ./models or as
                      an already loaded word2vec model
    n: the number of nearest neighbours to return (default: 10)
    models: list of tuples with the name of the time slice and the word2vec
            model (default: all in ./models, loaded on first use)

    Return: list of tuples with the word, the time slice and the cosine
    similarity of the nearest neighbours, sorted from most to least similar.
    The list is empty when ``n`` is not positive.
    '''
    # Accept a model name (original behaviour) as well as a loaded model
    # (what the parameter name and callers holding a model expect).
    if isinstance(time_slice_model, str):
        reference_model = load_word2vec_model(_model_path(time_slice_model))
    else:
        reference_model = time_slice_model

    # Load lazily: a default of ``load_all_models()`` in the signature would
    # be evaluated once at import time and read every model from disk.
    if models is None:
        models = load_all_models()

    target_vector = get_word_vector(reference_model, word)

    def scored_candidates():
        # Yield (word, time slice, similarity) for every word of every model.
        for entry in models:
            model_name, model = entry[0], entry[1]
            for candidate in model.wv.key_to_index:
                similarity = cosine_similarity(
                    target_vector, get_word_vector(model, candidate)
                )
                yield (candidate, model_name, similarity)

    # nlargest keeps only the n best candidates and returns them sorted
    # from largest to smallest similarity.
    return heapq.nlargest(n, scored_candidates(), key=itemgetter(2))


def main():
    '''
    Print the five nearest neighbours of 'πατήρ' in the archaic model,
    searched across all time-slice models
    '''
    models = load_all_models()
    archaic_model = dict(models)['archaic']

    nearest_neighbours = get_nearest_neighbours(
        'πατήρ', archaic_model, n=5, models=models
    )
    print(nearest_neighbours)


if __name__ == "__main__":
    main()