from refactor_analysis import RefactorAnalysis
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "huggingface/CodeBERTa-small-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize each file version from the parent/child commit map.
# truncation=True keeps long source files within the model's input limit.
tokenized_inputs = [
    tokenizer(file_content, return_tensors="pt", truncation=True)
    for file_content in RefactorAnalysis()._parent_child_commit_map()
]

# Run the encoder without tracking gradients (inference only).
# "inputs" avoids shadowing the built-in input().
with torch.no_grad():
    outputs = [model(**inputs) for inputs in tokenized_inputs]

# Mean-pool the token embeddings into one fixed-size vector per file.
embeddings = [output.last_hidden_state.mean(dim=1).squeeze() for output in outputs]

print(embeddings[0].shape)  # one vector of the model's hidden size per file
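
# A minimal follow-up sketch, assuming the point of pooling these vectors
# is to compare file versions (the exact shape of what
# _parent_child_commit_map() yields is not shown here). Cosine similarity
# between any two pooled vectors gives a scalar in [-1, 1]; values near 1
# mean the model represents the two versions almost identically.
if len(embeddings) >= 2:
    similarity = torch.nn.functional.cosine_similarity(
        embeddings[0], embeddings[1], dim=0
    )
    print(f"similarity between first two embeddings: {similarity.item():.4f}")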