jRefactoring / archive / codeBert.py
gautam-shetty's picture
Initial commit
a5fb347
raw
history blame contribute delete
642 Bytes
from refactor_analysis import RefactorAnalysis
from transformers import AutoTokenizer, AutoModel
import torch
# Embed refactoring-related file contents with CodeBERTa and mean-pool the
# token-level hidden states into one fixed-size vector per file.
model_name = "huggingface/CodeBERTa-small-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)  # loaded in eval mode by default

# Tokenize each file snapshot from the parent/child commit map.
# truncation=True caps sequences at the model's maximum length — raw source
# files routinely exceed 512 tokens and would otherwise crash the
# position-embedding lookup at inference time.
tokenized_inputs = [
    tokenizer(file_content, return_tensors="pt", truncation=True)
    for file_content in RefactorAnalysis()._parent_child_commit_map()
]

# Inference only: no_grad skips autograd bookkeeping and saves memory.
with torch.no_grad():
    # `encoding` instead of `input`, which shadowed the builtin.
    outputs = [model(**encoding) for encoding in tokenized_inputs]

# Mean over the sequence axis (dim=1) gives a single embedding per file;
# squeeze drops the batch dimension of size 1.
embeddings = [output.last_hidden_state.mean(dim=1).squeeze() for output in outputs]

print(embeddings[0].shape)