# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""
import torch.nn.functional as F
import argparse
import logging
import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:24"
import pickle
import random
import torch
import json
from random import choice
import numpy as np
from itertools import cycle
from model import Model,Multi_Loss_CoCoSoDa
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
# T5Config/T5ForConditionalGeneration are needed by the "codet5" branch in main()/gen_vector()
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer,
                          T5Config, T5ForConditionalGeneration)
logger = logging.getLogger(__name__)
from tqdm import tqdm
import multiprocessing
cpu_cont = 16
from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript
from parser import (remove_comments_and_docstrings,
tree_to_token_index,
index_to_code_token,
tree_to_variable_index)
from tree_sitter import Language, Parser
import sys
sys.path.append("dataset")
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.8)
from utils import save_json_data, save_pickle_data
dfg_function={
'python':DFG_python,
'java':DFG_java,
'ruby':DFG_ruby,
'go':DFG_go,
'php':DFG_php,
'javascript':DFG_javascript
}
parsers={}
for lang in dfg_function:
LANGUAGE = Language('parser/my-languages.so', lang)
parser = Parser()
parser.set_language(LANGUAGE)
parser = [parser,dfg_function[lang]]
parsers[lang]= parser
ruby_special_token = ['keyword', 'identifier', 'separators', 'simple_symbol', 'constant', 'instance_variable',
'operator', 'string_content', 'integer', 'escape_sequence', 'comment', 'hash_key_symbol',
'global_variable', 'heredoc_beginning', 'heredoc_content', 'heredoc_end', 'class_variable',]
java_special_token = ['keyword', 'identifier', 'type_identifier', 'separators', 'operator', 'decimal_integer_literal',
'void_type', 'string_literal', 'decimal_floating_point_literal',
'boolean_type', 'null_literal', 'comment', 'hex_integer_literal', 'character_literal']
go_special_token = ['keyword', 'identifier', 'separators', 'type_identifier', 'int_literal', 'operator',
'field_identifier', 'package_identifier', 'comment', 'escape_sequence', 'raw_string_literal',
'rune_literal', 'label_name', 'float_literal']
javascript_special_token =['keyword', 'separators', 'identifier', 'property_identifier', 'operator',
'number', 'string_fragment', 'comment', 'regex_pattern', 'shorthand_property_identifier_pattern',
'shorthand_property_identifier', 'regex_flags', 'escape_sequence', 'statement_identifier']
php_special_token =['text', 'php_tag', 'name', 'operator', 'keyword', 'string', 'integer', 'separators', 'comment',
'escape_sequence', 'ERROR', 'boolean', 'namespace', 'class', 'extends']
python_special_token =['keyword', 'identifier', 'separators', 'operator', '"', 'integer',
'comment', 'none', 'escape_sequence']
special_token={
'python':python_special_token,
'java':java_special_token,
'ruby':ruby_special_token,
'go':go_special_token,
'php':php_special_token,
'javascript':javascript_special_token
}
all_special_token = []
for key, value in special_token.items():
all_special_token = list(set(all_special_token ).union(set(value)))
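# `all_special_token` is the union of the per-language token-type names above;
# create_model() registers them as additional special tokens of the tokenizer.
# Alignment/uniformity diagnostics (cf. Wang & Isola, 2020): lalign is the mean
# distance between positive pairs, lunif the log of the average pairwise
# Gaussian potential of the embeddings.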
def lalign(x, y, alpha=2):
x = torch.tensor(x)
y= torch.tensor(y)
return (x - y).norm(dim=1).pow(alpha).mean()
# code2nl_pos = torch.einsum('nc,nc->n', [x, y]).unsqueeze(-1)
# return code2nl_pos.mean()
def lunif(x, t=2):
x = torch.tensor(x)
sq_pdist = torch.pdist(x, p=2).pow(2)
return sq_pdist.mul(-t).exp().mean().log()
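# `ranks` holds reciprocal ranks (1/rank, or 0 when the answer is not in the
# top-1000), so R@k counts queries whose reciprocal rank is at least 1/k.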
def cal_r1_r5_r10(ranks):
r1,r5,r10= 0,0,0
data_len= len(ranks)
for item in ranks:
if item >=1:
r1 +=1
r5 += 1
r10 += 1
elif item >=0.2:
r5+= 1
r10+=1
elif item >=0.1:
r10 +=1
result = {"R@1":round(r1/data_len,3), "R@5": round(r5/data_len,3), "R@10": round(r10/data_len,3)}
return result
#remove comments, tokenize code and extract dataflow
def extract_dataflow(code, parser,lang):
#remove comments
try:
code=remove_comments_and_docstrings(code,lang)
except:
pass
#obtain dataflow
if lang=="php":
code="<?php"+code+"?>"
try:
tree = parser[0].parse(bytes(code,'utf8'))
root_node = tree.root_node
tokens_index=tree_to_token_index(root_node)
code=code.split('\n')
code_tokens=[index_to_code_token(x,code) for x in tokens_index]
index_to_code={}
for idx,(index,code) in enumerate(zip(tokens_index,code_tokens)):
index_to_code[index]=(idx,code)
try:
DFG,_=parser[1](root_node,index_to_code,{})
except:
DFG=[]
DFG=sorted(DFG,key=lambda x:x[1])
indexs=set()
for d in DFG:
if len(d[-1])!=0:
indexs.add(d[1])
for x in d[-1]:
indexs.add(x)
new_DFG=[]
for d in DFG:
if d[1] in indexs:
new_DFG.append(d)
dfg=new_DFG
    except Exception:
        # parsing failed: return whitespace tokens and an empty dataflow graph
        # (the original left `code_tokens` undefined on this path)
        if isinstance(code, list):
            code = '\n'.join(code)
        code_tokens = code.split()
        dfg = []
    return code_tokens, dfg
#remove comments, tokenize code and extract dataflow
def tokenizer_source_code(code, parser,lang):
#remove comments
try:
code=remove_comments_and_docstrings(code,lang)
except:
pass
    # parse the code with tree-sitter and recover the surface tokens
    if lang == "php":
        code = "<?php" + code + "?>"
    try:
        tree = parser[0].parse(bytes(code, 'utf8'))
        root_node = tree.root_node
        tokens_index = tree_to_token_index(root_node)
        code = code.split('\n')
        code_tokens = [index_to_code_token(x, code) for x in tokens_index]
    except Exception:
        # fall back to whitespace tokenization if parsing fails; the original
        # code left `code_tokens` undefined (and set an unused `dfg`) here
        if isinstance(code, list):
            code = '\n'.join(code)
        code_tokens = code.split()
    return code_tokens
class InputFeatures(object):
"""A single training/test features for a example."""
def __init__(self,
code_tokens,
code_ids,
# position_idx,
# dfg_to_code,
# dfg_to_dfg,
nl_tokens,
nl_ids,
url,
):
self.code_tokens = code_tokens
self.code_ids = code_ids
# self.position_idx=position_idx
# self.dfg_to_code=dfg_to_code
# self.dfg_to_dfg=dfg_to_dfg
self.nl_tokens = nl_tokens
self.nl_ids = nl_ids
self.url=url
class TypeAugInputFeatures(object):
"""A single training/test features for a example."""
def __init__(self,
code_tokens,
code_ids,
# position_idx,
code_type,
code_type_ids,
nl_tokens,
nl_ids,
url,
):
self.code_tokens = code_tokens
self.code_ids = code_ids
# self.position_idx=position_idx
self.code_type=code_type
self.code_type_ids=code_type_ids
self.nl_tokens = nl_tokens
self.nl_ids = nl_ids
self.url=url
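# The converters below run inside the multiprocessing pool, so each receives a
# single (example, tokenizer, args) tuple; both sides are tokenized, wrapped in
# CLS/SEP (plus "<encoder-only>" in the type-augmented variant) and padded to
# args.code_length / args.nl_length.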
def convert_examples_to_features(js):
js,tokenizer,args=js
#code
if args.lang == "java_mini":
parser=parsers["java"]
else:
parser=parsers[js["language"]]
# code
code_tokens=tokenizer_source_code(js['original_string'],parser,args.lang)
code_tokens=" ".join(code_tokens[:args.code_length-2])
code_tokens=tokenizer.tokenize(code_tokens)[:args.code_length-2]
code_tokens =[tokenizer.cls_token]+code_tokens+[tokenizer.sep_token]
code_ids = tokenizer.convert_tokens_to_ids(code_tokens)
padding_length = args.code_length - len(code_ids)
code_ids+=[tokenizer.pad_token_id]*padding_length
#nl
nl=' '.join(js['docstring_tokens'])
nl_tokens=tokenizer.tokenize(nl)[:args.nl_length-2]
nl_tokens =[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]
nl_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
padding_length = args.nl_length - len(nl_ids)
nl_ids+=[tokenizer.pad_token_id]*padding_length
return InputFeatures(code_tokens,code_ids,nl_tokens,nl_ids,js['url'])
def convert_examples_to_features_aug_type(js):
js,tokenizer,args=js
#code
if args.lang == "java_mini":
parser=parsers["java"]
else:
parser=parsers[js["language"]]
# code
token_type_role = js[ 'bpe_token_type_role']
code_token = [item[0] for item in token_type_role]
# code = ' '.join(code_token[:args.code_length-4])
# code_tokens = tokenizer.tokenize(code)[:args.code_length-4]
code_tokens = code_token[:args.code_length-4]
code_tokens =[tokenizer.cls_token,"<encoder-only>",tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
code_ids = tokenizer.convert_tokens_to_ids(code_tokens)
padding_length = args.code_length - len(code_ids)
code_ids += [tokenizer.pad_token_id]*padding_length
# code type
code_type_token = [item[-1] for item in token_type_role]
# code_type= ' '.join(code_type_token[:args.code_length-4])
# code_type_tokens = tokenizer.tokenize(code_type)[:args.code_length-4]
code_type_tokens = code_type_token[:args.code_length-4]
code_type_tokens =[tokenizer.cls_token,"<encoder-only>",tokenizer.sep_token]+code_type_tokens+[tokenizer.sep_token]
code_type_ids = tokenizer.convert_tokens_to_ids(code_type_tokens)
padding_length = args.code_length - len(code_type_ids)
code_type_ids += [tokenizer.pad_token_id]*padding_length
#nl
nl=' '.join(js['docstring_tokens'])
nl_tokens = tokenizer.tokenize(nl)[:args.nl_length-4]
nl_tokens = [tokenizer.cls_token,"<encoder-only>",tokenizer.sep_token]+nl_tokens+[tokenizer.sep_token]
nl_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
padding_length = args.nl_length - len(nl_ids)
nl_ids += [tokenizer.pad_token_id]*padding_length
return TypeAugInputFeatures(code_tokens,code_ids,code_type_tokens,code_type_ids,nl_tokens,nl_ids,js['url'])
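# TextDataset loads pre-converted features from <output_dir>/<prefix>.pkl when
# that cache exists; otherwise it parses the jsonl file and converts examples
# with the pool, using the type-augmented converter when
# --data_aug_type replace_type is set.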
class TextDataset(Dataset):
def __init__(self, tokenizer, args, file_path=None,pool=None):
self.args=args
prefix=file_path.split('/')[-1][:-6]
cache_file=args.output_dir+'/'+prefix+'.pkl'
n_debug_samples = args.n_debug_samples
# if 'codebase' in file_path:
# n_debug_samples = 100000
if 'train' in file_path:
self.split = "train"
else:
self.split = "other"
if os.path.exists(cache_file):
self.examples=pickle.load(open(cache_file,'rb'))
if args.debug:
self.examples= self.examples[:n_debug_samples]
else:
self.examples = []
data=[]
if args.debug:
with open(file_path, encoding="utf-8") as f:
for line in f:
line=line.strip()
js=json.loads(line)
data.append((js,tokenizer,args))
if len(data) >= n_debug_samples:
break
else:
with open(file_path, encoding="utf-8") as f:
for line in f:
line=line.strip()
js=json.loads(line)
data.append((js,tokenizer,args))
if self.args.data_aug_type == "replace_type":
self.examples=pool.map(convert_examples_to_features_aug_type, tqdm(data,total=len(data)))
else:
self.examples=pool.map(convert_examples_to_features, tqdm(data,total=len(data)))
if 'train' in file_path:
for idx, example in enumerate(self.examples[:3]):
logger.info("*** Example ***")
logger.info("idx: {}".format(idx))
logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids))))
logger.info("nl_tokens: {}".format([x.replace('\u0120','_') for x in example.nl_tokens]))
logger.info("nl_ids: {}".format(' '.join(map(str, example.nl_ids))))
def __len__(self):
return len(self.examples)
    def __getitem__(self, item):
        if self.args.data_aug_type == "replace_type":
            return (torch.tensor(self.examples[item].code_ids),
                    torch.tensor(self.examples[item].code_type_ids),
                    torch.tensor(self.examples[item].nl_ids))
        else:
            # code_tokens/nl_tokens are subword strings and cannot be turned
            # into tensors, so only the id tensors are returned here
            return (torch.tensor(self.examples[item].code_ids),
                    torch.tensor(self.examples[item].nl_ids))
def convert_examples_to_features_unixcoder(js,tokenizer,args):
"""convert examples to token ids"""
code = ' '.join(js['code_tokens']) if type(js['code_tokens']) is list else ' '.join(js['code_tokens'].split())
code_tokens = tokenizer.tokenize(code)[:args.code_length-4]
code_tokens =[tokenizer.cls_token,"<encoder-only>",tokenizer.sep_token]+code_tokens+[tokenizer.sep_token]
code_ids = tokenizer.convert_tokens_to_ids(code_tokens)
padding_length = args.code_length - len(code_ids)
code_ids += [tokenizer.pad_token_id]*padding_length
nl = ' '.join(js['docstring_tokens']) if type(js['docstring_tokens']) is list else ' '.join(js['doc'].split())
nl_tokens = tokenizer.tokenize(nl)[:args.nl_length-4]
nl_tokens = [tokenizer.cls_token,"<encoder-only>",tokenizer.sep_token]+nl_tokens+[tokenizer.sep_token]
nl_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
padding_length = args.nl_length - len(nl_ids)
nl_ids += [tokenizer.pad_token_id]*padding_length
return InputFeatures(code_tokens,code_ids,nl_tokens,nl_ids,js['url'] if "url" in js else js["retrieval_idx"])
class TextDataset_unixcoder(Dataset):
def __init__(self, tokenizer, args, file_path=None, pooler=None):
self.examples = []
data = []
n_debug_samples = args.n_debug_samples
with open(file_path) as f:
if "jsonl" in file_path:
for line in f:
line = line.strip()
js = json.loads(line)
if 'function_tokens' in js:
js['code_tokens'] = js['function_tokens']
data.append(js)
if args.debug and len(data) >= n_debug_samples:
break
elif "codebase"in file_path or "code_idx_map" in file_path:
js = json.load(f)
for key in js:
temp = {}
temp['code_tokens'] = key.split()
temp["retrieval_idx"] = js[key]
temp['doc'] = ""
temp['docstring_tokens'] = ""
data.append(temp)
if args.debug and len(data) >= n_debug_samples:
break
elif "json" in file_path:
for js in json.load(f):
data.append(js)
if args.debug and len(data) >= n_debug_samples:
break
# if "test" in file_path:
# data = data[-200:]
for js in data:
self.examples.append(convert_examples_to_features_unixcoder(js,tokenizer,args))
if "train" in file_path:
# self.examples = self.examples[:128]
for idx, example in enumerate(self.examples[:3]):
logger.info("*** Example ***")
logger.info("idx: {}".format(idx))
logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids))))
logger.info("nl_tokens: {}".format([x.replace('\u0120','_') for x in example.nl_tokens]))
logger.info("nl_ids: {}".format(' '.join(map(str, example.nl_ids))))
def __len__(self):
return len(self.examples)
def __getitem__(self, i):
return (torch.tensor(self.examples[i].code_ids),torch.tensor(self.examples[i].nl_ids))
# return (torch.tensor(self.examples[i].code_ids),
# torch.tensor(self.examples[i].nl_ids),
# [self.examples[i].code_tokens],
# [self.examples[i].nl_tokens])
def set_seed(seed=42):
random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) # all gpus
torch.backends.cudnn.deterministic = True
def mask_tokens(inputs,tokenizer,mlm_probability):
""" Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
labels = inputs.clone()
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
probability_matrix = torch.full(labels.shape, mlm_probability).to(inputs.device)
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in
labels.tolist()] # for masking special token
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool).to(inputs.device), value=0.0)
if tokenizer._pad_token is not None:
padding_mask = labels.eq(tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0) # masked padding
masked_indices = torch.bernoulli(probability_matrix).bool() # will decide who will be masked
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool().to(inputs.device) & masked_indices
inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
# 10% of the time, we replace masked input tokens with random word
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool().to(inputs.device) & masked_indices & ~indices_replaced
random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long).to(inputs.device)
inputs[indices_random] = random_words[indices_random]
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
def replace_with_type_tokens(inputs,replaces,tokenizer,mlm_probability):
""" Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
labels = inputs.clone()
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
probability_matrix = torch.full(labels.shape, mlm_probability).to(inputs.device)
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in
labels.tolist()] # for masking special token
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool).to(inputs.device), value=0.0)
if tokenizer._pad_token is not None:
padding_mask = labels.eq(tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0) # masked padding
masked_indices = torch.bernoulli(probability_matrix).bool() # will decide who will be masked
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool().to(inputs.device) & masked_indices
inputs[indices_replaced] = replaces[indices_replaced]
return inputs, labels
def replace_special_token_with_type_tokens(inputs, special_token_ids, tokenizer, mlm_probability):
    """ Sample positions whose token equals `special_token_ids` with probability `mlm_probability`; 80% of the sampled positions are assigned that id and labels are produced for them. """
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, 0.0).to(inputs.device)
    probability_matrix.masked_fill_(labels.eq(special_token_ids).to(inputs.device), value=mlm_probability)
    masked_indices = torch.bernoulli(probability_matrix).bool()  # decides which positions are selected
    labels[~masked_indices] = -100  # loss is only computed on the selected tokens
    # 80% of the selected positions receive the chosen type token id
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool().to(inputs.device) & masked_indices
    inputs[indices_replaced] = special_token_ids
    return inputs, labels
def replace_special_token_with_mask(inputs, special_token_ids, tokenizer, mlm_probability):
    """ Sample positions whose token equals `special_token_ids` with probability `mlm_probability` and replace 80% of them with the mask token. """
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, 0.0).to(inputs.device)
    probability_matrix.masked_fill_(labels.eq(special_token_ids).to(inputs.device), value=mlm_probability)
    masked_indices = torch.bernoulli(probability_matrix).bool()  # decides which positions are selected
    labels[~masked_indices] = -100  # loss is only computed on the selected tokens
    # 80% of the selected positions are replaced with the mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool().to(inputs.device) & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    return inputs, labels
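# Fine-tuning: in-batch contrastive learning. The pooled outputs of the query
# encoders (nl_encoder_q / code_encoder_q) give one NL and one code vector per
# example; the similarity matrix nl @ code.T is scaled by 20 (temperature 0.05)
# and trained with cross-entropy against the diagonal. The best checkpoint by
# validation MRR and a per-epoch checkpoint are saved.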
def train(args, model, tokenizer,pool):
""" Train the model """
if args.data_aug_type == "replace_type" :
train_dataset=TextDataset(tokenizer, args, args.train_data_file, pool)
else:
# if "unixcoder" in args.model_name_or_path or "coco" in args.model_name_or_path :
train_dataset=TextDataset_unixcoder(tokenizer, args, args.train_data_file, pool)
# else:
# train_dataset=TextDataset(tokenizer, args, args.train_data_file, pool)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,num_workers=4,drop_last=True)
model.to(args.device)
if args.local_rank not in [-1, 0]:
torch.distributed.barrier()
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
'weight_decay': args.weight_decay},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*args.num_train_epochs)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Num quene = %d", args.moco_k)
logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size//args.n_gpu)
logger.info(" Total train batch size = %d", args.train_batch_size)
logger.info(" Total optimization steps = %d", len(train_dataloader)*args.num_train_epochs)
model.zero_grad()
model.train()
tr_num,tr_loss,best_mrr=0,0,-1
loss_fct = CrossEntropyLoss()
# if args.model_type == "multi-loss-cocosoda" :
if args.model_type in ["no_aug_cocosoda", "multi-loss-cocosoda"] :
if args.do_continue_pre_trained:
logger.info("do_continue_pre_trained")
elif args.do_fine_tune:
logger.info("do_fine_tune")
special_token_list = special_token[args.lang]
special_token_id_list = tokenizer.convert_tokens_to_ids(special_token_list)
model_eval = model.module if hasattr(model,'module') else model
for idx in range(args.num_train_epochs):
print(idx)
for step,batch in enumerate(train_dataloader):
#get inputs
code_inputs = batch[0].to(args.device)
nl_inputs = batch[1].to(args.device)
#get code and nl vectors
nl_outputs = model_eval.nl_encoder_q(nl_inputs, attention_mask=nl_inputs.ne(1))
nl_vec =nl_outputs [1]
code_outputs = model_eval.code_encoder_q(code_inputs, attention_mask=code_inputs.ne(1))
code_vec =code_outputs [1]
# code_vec = model(code_inputs=code_inputs)
# nl_vec = model(nl_inputs=nl_inputs)
torch.cuda.empty_cache()
tr_num+=1
#calculate scores and loss
scores = torch.einsum("ab,cb->ac",nl_vec,code_vec)
loss = loss_fct(scores*20, torch.arange(code_inputs.size(0), device=scores.device))
tr_loss += loss.item()
if (step+1)% args.eval_frequency==0:
logger.info("epoch {} step {} loss {}".format(idx,step+1,round(tr_loss/tr_num,5)))
tr_loss=0
tr_num=0
#backward
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
optimizer.zero_grad()
scheduler.step()
torch.cuda.empty_cache()
results = evaluate(args, model, tokenizer,args.eval_data_file, pool, eval_when_training=True)
for key, value in results.items():
logger.info(" %s = %s", key, round(value,4))
#save best model
if results['eval_mrr']>best_mrr:
best_mrr=results['eval_mrr']
logger.info(" "+"*"*20)
logger.info(" Best mrr:%s",round(best_mrr,4))
logger.info(" "+"*"*20)
checkpoint_prefix = 'checkpoint-best-mrr'
output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = model.module if hasattr(model,'module') else model
output_dir = os.path.join(output_dir, '{}'.format('model.bin'))
torch.save(model_to_save.state_dict(), output_dir)
logger.info("Saving model checkpoint to %s", output_dir)
output_dir_epoch = os.path.join(args.output_dir, '{}'.format(idx))
if not os.path.exists(output_dir_epoch):
os.makedirs(output_dir_epoch)
model_to_save = model.module if hasattr(model,'module') else model
output_dir_epoch = os.path.join(output_dir_epoch, '{}'.format('model.bin'))
torch.save(model_to_save.state_dict(), output_dir_epoch)
logger.info("Saving model checkpoint to %s", output_dir_epoch)
def multi_lang_continue_pre_train(args, model, tokenizer,pool):
""" Train the model """
#get training dataset
if "unixcoder" in args.model_name_or_path:
train_datasets = []
for train_data_file in args.couninue_pre_train_data_files:
train_dataset=TextDataset_unixcoder(tokenizer, args, train_data_file, pool)
train_datasets.append(train_dataset)
else:
train_datasets = []
for train_data_file in args.couninue_pre_train_data_files:
train_dataset=TextDataset(tokenizer, args, train_data_file, pool)
train_datasets.append(train_dataset)
train_samplers = [RandomSampler(train_dataset) for train_dataset in train_datasets]
# https://blog.csdn.net/weixin_44966641/article/details/124878064
train_dataloaders = [cycle(DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,drop_last=True)) for train_dataset,train_sampler in zip(train_datasets,train_samplers)]
t_total = args.max_steps
#get optimizer and scheduler
# Prepare optimizer and schedule (linear warmup and decay)https://huggingface.co/transformers/v3.3.1/training.html
model.to(args.device)
if args.local_rank not in [-1, 0]:
torch.distributed.barrier()
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.num_warmup_steps,num_training_steps=t_total)
# Train!
training_data_length = sum ([len(item) for item in train_datasets])
logger.info("***** Running training *****")
logger.info(" Num examples = %d", training_data_length)
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Num quene = %d", args.moco_k)
logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size//args.n_gpu)
logger.info(" Total train batch size = %d", args.train_batch_size)
checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt')
optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt')
if os.path.exists(scheduler_last):
scheduler.load_state_dict(torch.load(scheduler_last, map_location="cpu"))
if os.path.exists(optimizer_last):
optimizer.load_state_dict(torch.load(optimizer_last, map_location="cpu"))
if args.local_rank == 0:
torch.distributed.barrier()
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank%args.gpu_per_node],
output_device=args.local_rank%args.gpu_per_node,
find_unused_parameters=True)
loss_fct = CrossEntropyLoss()
set_seed(args.seed) # Added here for reproducibility (even between python 2 and 3)
probs=[len(x) for x in train_datasets]
probs=[x/sum(probs) for x in probs]
probs=[x**0.7 for x in probs]
probs=[x/sum(probs) for x in probs]
# global_step = args.start_step
model.zero_grad()
model.train()
global_step = args.start_step
step=0
tr_loss, logging_loss,avg_loss,tr_nb, best_mrr = 0.0, 0.0,0.0,0,-1
tr_num=0
special_token_list = all_special_token
special_token_id_list = tokenizer.convert_tokens_to_ids(special_token_list)
while True:
train_dataloader=np.random.choice(train_dataloaders, 1, p=probs)[0]
# train_dataloader=train_dataloader[0]
step+=1
batch=next(train_dataloader)
# source_ids= batch.to(args.device)
model.train()
# loss = model(source_ids)
code_inputs = batch[0].to(args.device)
code_transformations_ids = code_inputs.clone()
nl_inputs = batch[1].to(args.device)
nl_transformations_ids= nl_inputs.clone()
if step%4 == 0:
code_transformations_ids[:, 3:], _ = mask_tokens(code_inputs.clone()[:, 3:] ,tokenizer,args.mlm_probability)
nl_transformations_ids[:, 3:], _ = mask_tokens(nl_inputs.clone()[:, 3:] ,tokenizer,args.mlm_probability)
elif step%4 == 1:
code_types = code_inputs.clone()
code_transformations_ids[:, 3:], _ = replace_with_type_tokens(code_inputs.clone()[:, 3:] ,code_types.clone()[:, 3:],tokenizer,args.mlm_probability)
elif step%4 == 2:
random.seed( step)
choice_token_id = choice(special_token_id_list)
code_transformations_ids[:, 3:], _ = replace_special_token_with_type_tokens(code_inputs.clone()[:, 3:], choice_token_id, tokenizer,args.mlm_probability)
elif step%4 == 3:
random.seed( step)
choice_token_id = choice(special_token_id_list)
code_transformations_ids[:, 3:], _ = replace_special_token_with_mask(code_inputs.clone()[:, 3:], choice_token_id, tokenizer,args.mlm_probability)
tr_num+=1
inter_output, inter_target, _, _= model(source_code_q=code_inputs, source_code_k=code_transformations_ids,
nl_q=nl_inputs , nl_k=nl_transformations_ids )
# loss_fct = CrossEntropyLoss()
loss = loss_fct(20*inter_output, inter_target)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step+1)% args.eval_frequency==0:
logger.info("step {} loss {}".format(step+1,round(tr_loss/tr_num,5)))
tr_loss=0
tr_num=0
if (step + 1) % args.gradient_accumulation_steps == 0:
optimizer.step()
optimizer.zero_grad()
scheduler.step()
global_step += 1
output_flag=True
avg_loss=round((tr_loss - logging_loss) /(global_step- tr_nb),6)
if global_step %100 == 0:
logger.info(" global steps (step*gradient_accumulation_steps ): %s loss: %s", global_step, round(avg_loss,6))
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
logging_loss = tr_loss
tr_nb=global_step
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
checkpoint_prefix = 'checkpoint-mrr'
# results = evaluate(args, model, tokenizer,pool=pool,eval_when_training=True)
results = evaluate(args, model, tokenizer,args.eval_data_file, pool, eval_when_training=True)
# for key, value in results.items():
# logger.info(" %s = %s", key, round(value,6))
logger.info(" %s = %s", 'eval_mrr', round(results['eval_mrr'],6))
if results['eval_mrr']>best_mrr:
best_mrr=results['eval_mrr']
logger.info(" "+"*"*20)
logger.info(" Best mrr:%s",round(best_mrr,4))
logger.info(" "+"*"*20)
output_dir = os.path.join(args.output_dir, '{}'.format('checkpoint-best-mrr'))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = model.module if hasattr(model,'module') else model
output_dir = os.path.join(output_dir, '{}'.format('model.bin'))
torch.save(model_to_save.state_dict(), output_dir)
logger.info("Saving model checkpoint to %s", output_dir)
# Save model checkpoint
output_dir = os.path.join(args.output_dir, '{}-{}-{}'.format(checkpoint_prefix, global_step,round(results['eval_mrr'],6)))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = model.module.code_encoder_q if hasattr(model,'module') else model.code_encoder_q # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
logger.info("Saving model checkpoint to %s", output_dir)
# _rotate_checkpoints(args, checkpoint_prefix)
last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
if not os.path.exists(last_output_dir):
os.makedirs(last_output_dir)
model_to_save.save_pretrained(last_output_dir)
idx_file = os.path.join(last_output_dir, 'idx_file.txt')
with open(idx_file, 'w', encoding='utf-8') as idxf:
idxf.write(str(0) + '\n')
torch.save(optimizer.state_dict(), os.path.join(last_output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(last_output_dir, "scheduler.pt"))
logger.info("Saving optimizer and scheduler states to %s", last_output_dir)
step_file = os.path.join(last_output_dir, 'step_file.txt')
with open(step_file, 'w', encoding='utf-8') as stepf:
stepf.write(str(global_step) + '\n')
if args.max_steps > 0 and global_step > args.max_steps:
break
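# NOTE: `whitening_torch_final` is used below when --do_whitening is set, but it
# is neither defined nor imported in this file. The sketch here is an assumption:
# a standard whitening transform in the spirit of WhiteningBERT
# (https://github.com/Jun-jie-Huang/WhiteningBERT), not necessarily the authors' exact version.
def whitening_torch_final(embeddings):
    # center the embeddings and compute the covariance matrix
    mu = torch.mean(embeddings, dim=0, keepdim=True)
    cov = torch.mm((embeddings - mu).t(), embeddings - mu)
    # whitening matrix W = U * diag(1/sqrt(S)) from the SVD of the covariance
    u, s, v = torch.svd(cov)
    W = torch.mm(u, torch.diag(1.0 / torch.sqrt(s)))
    return torch.mm(embeddings - mu, W)
# Evaluation: encode the query set and the retrieval codebase, aggregate token
# states according to --agg_way, L2-normalize, score every (NL, code) pair by an
# inner product, and report MRR over the top-1000 ranks plus R@1/5/10.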
def evaluate(args, model, tokenizer,file_name,pool, eval_when_training=False):
# if "unixcoder" in args.model_name_or_path or "coco" in args.model_name_or_path :
dataset_class = TextDataset_unixcoder
# else:
# dataset_class = TextDataset
query_dataset = dataset_class(tokenizer, args, file_name, pool)
query_sampler = SequentialSampler(query_dataset)
query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=args.eval_batch_size,num_workers=4)
code_dataset = dataset_class(tokenizer, args, args.codebase_file, pool)
code_sampler = SequentialSampler(code_dataset)
code_dataloader = DataLoader(code_dataset, sampler=code_sampler, batch_size=args.eval_batch_size,num_workers=4)
# multi-gpu evaluate
if args.n_gpu > 1 and eval_when_training is False:
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation on %s *****"%args.lang)
logger.info(" Num queries = %d", len(query_dataset))
logger.info(" Num codes = %d", len(code_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
model.eval()
model_eval = model.module if hasattr(model,'module') else model
code_vecs=[]
nl_vecs=[]
for batch in query_dataloader:
nl_inputs = batch[-1].to(args.device)
with torch.no_grad():
if args.model_type == "base" :
nl_vec = model(nl_inputs=nl_inputs)
elif args.model_type in ["cocosoda" ,"no_aug_cocosoda", "multi-loss-cocosoda"]:
outputs = model_eval.nl_encoder_q(nl_inputs, attention_mask=nl_inputs.ne(1))
if args.agg_way == "avg":
outputs = outputs [0]
                    nl_vec = (outputs*nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(-1)[:,None]  # indexing with None adds a dimension (masked average pooling)
elif args.agg_way == "cls_pooler":
nl_vec =outputs [1]
elif args.agg_way == "avg_cls_pooler":
nl_vec =outputs [1] + (outputs[0]*nl_inputs.ne(1)[:,:,None]).sum(1)/nl_inputs.ne(1).sum(-1)[:,None]
nl_vec = torch.nn.functional.normalize( nl_vec, p=2, dim=1)
if args.do_whitening:
nl_vec=whitening_torch_final(nl_vec)
nl_vecs.append(nl_vec.cpu().numpy())
for batch in code_dataloader:
with torch.no_grad():
code_inputs = batch[0].to(args.device)
if args.model_type == "base" :
code_vec = model(code_inputs=code_inputs)
elif args.model_type in ["cocosoda" ,"no_aug_cocosoda", "multi-loss-cocosoda"]:
# code_vec = model_eval.code_encoder_q(code_inputs, attention_mask=code_inputs.ne(1))[1]
outputs = model_eval.code_encoder_q(code_inputs, attention_mask=code_inputs.ne(1))
if args.agg_way == "avg":
outputs = outputs [0]
                    code_vec = (outputs*code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(-1)[:,None]  # indexing with None adds a dimension (masked average pooling)
elif args.agg_way == "cls_pooler":
code_vec=outputs [1]
elif args.agg_way == "avg_cls_pooler":
code_vec=outputs [1] + (outputs[0]*code_inputs.ne(1)[:,:,None]).sum(1)/code_inputs.ne(1).sum(-1)[:,None]
code_vec = torch.nn.functional.normalize(code_vec, p=2, dim=1)
if args.do_whitening:
code_vec=whitening_torch_final(code_vec)
code_vecs.append(code_vec.cpu().numpy())
model.train()
code_vecs=np.concatenate(code_vecs,0)
nl_vecs=np.concatenate(nl_vecs,0)
scores=np.matmul(nl_vecs,code_vecs.T)
sort_ids=np.argsort(scores, axis=-1, kind='quicksort', order=None)[:,::-1]
nl_urls=[]
code_urls=[]
for example in query_dataset.examples:
nl_urls.append(example.url)
for example in code_dataset.examples:
code_urls.append(example.url)
ranks=[]
for url, sort_id in zip(nl_urls,sort_ids):
rank=0
find=False
for idx in sort_id[:1000]:
if find is False:
rank+=1
if code_urls[idx]==url:
find=True
if find:
ranks.append(1/rank)
else:
ranks.append(0)
if args.save_evaluation_reuslt:
evaluation_result = {"nl_urls":nl_urls, "code_urls":code_urls,"sort_ids":sort_ids[:,:10],"ranks":ranks}
save_pickle_data(args.save_evaluation_reuslt_dir, "evaluation_result.pkl",evaluation_result)
result = cal_r1_r5_r10(ranks)
result["eval_mrr"] = round(float(np.mean(ranks)),3)
return result
def parse_args():
parser = argparse.ArgumentParser()
# soda
    parser.add_argument('--data_aug_type', default="replace_type", choices=["replace_type", "random_mask", "other"],
                        help="data augmentation (SoDa) strategy", required=False)
    parser.add_argument('--aug_type_way', default="random_replace_type",
                        choices=["random_replace_type", "replace_special_type", "replace_special_type_with_mask"],
                        help="which type-replacement strategy to apply", required=False)
    parser.add_argument('--print_align_unif_loss', action='store_true', help='print the alignment/uniformity diagnostics (lalign/lunif)', required=False)
    parser.add_argument('--do_ineer_loss', action='store_true', help='enable the additional inner loss (consumed by the model in model.py)', required=False)
    parser.add_argument('--only_save_the_nl_code_vec', action='store_true', help='only save the nl/code vectors', required=False)
    parser.add_argument('--do_zero_short', action='store_true', help='zero-shot evaluation (do not load a fine-tuned checkpoint)', required=False)
    parser.add_argument('--agg_way', default="cls_pooler", choices=["avg", "cls_pooler", "avg_cls_pooler"],
                        help="how token representations are aggregated into a sequence vector", required=False)
    parser.add_argument('--weight_decay', default=0.01, type=float, required=False)
    parser.add_argument('--do_single_lang_continue_pre_train', action='store_true', help='do_single_lang_continue_pre_train', required=False)
    parser.add_argument('--save_evaluation_reuslt', action='store_true', help='save the full evaluation result (urls, ranks) as a pickle', required=False)
    parser.add_argument('--save_evaluation_reuslt_dir', type=str, help='directory for the saved evaluation result', required=False)
    parser.add_argument('--epoch', type=int, default=50,
                        help="number of training epochs")
# new continue pre-training
parser.add_argument('--fp16', action='store_true',
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
parser.add_argument("--local_rank", type=int, default=-1,
help="For distributed training: local_rank")
parser.add_argument("--loaded_model_filename", type=str, required=False,
help="loaded_model_filename")
parser.add_argument("--loaded_codebert_model_filename", type=str, required=False,
help="loaded_model_filename")
parser.add_argument('--do_multi_lang_continue_pre_train', action='store_true', help='do_multi_lang_continue_pre_train', required=False)
parser.add_argument("--couninue_pre_train_data_files", default=["dataset/ruby/train.jsonl", "dataset/java/train.jsonl",], type=str, nargs='+', required=False,
help="The input training data files (some json files).")
# parser.add_argument("--couninue_pre_train_data_files", default=["dataset/go/train.jsonl", "dataset/java/train.jsonl",
# "dataset/javascript/train.jsonl", "dataset/php/train.jsonl", "dataset/python/train.jsonl", "dataset/ruby/train.jsonl",], type=list, required=False,
# help="The input training data files (some json files).")
    parser.add_argument('--do_continue_pre_trained', action='store_true', help='continue pre-training from the loaded checkpoint', required=False)
    parser.add_argument('--do_fine_tune', action='store_true', help='fine-tune the loaded checkpoint', required=False)
parser.add_argument('--do_whitening', action='store_true', help='do_whitening https://github.com/Jun-jie-Huang/WhiteningBERT', required=False)
parser.add_argument("--time_score", default=1, type=int,help="cosine value * time_score")
parser.add_argument("--max_steps", default=100, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
parser.add_argument("--num_warmup_steps", default=0, type=int, help="num_warmup_steps")
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--logging_steps', type=int, default=50,
help="Log every X updates steps.")
parser.add_argument('--save_steps', type=int, default=50,
help="Save checkpoint every X updates steps.")
# new moco
    parser.add_argument('--moco_type', default="encoder_queue", choices=["encoder_queue", "encoder_momentum_encoder_queue"],
                        help="MoCo variant: encoder + queue, or encoder + momentum encoder + queue", required=False)
# debug
    parser.add_argument('--use_best_mrr_model', action='store_true', help='use the checkpoint with the best MRR', required=False)
parser.add_argument('--debug', action='store_true', help='debug mode', required=False)
parser.add_argument('--n_debug_samples', type=int, default=100, required=False)
parser.add_argument("--max_codeblock_num", default=10, type=int,
help="Optional NL input sequence length after tokenization.")
parser.add_argument('--hidden_size', type=int, default=768, required=False)
parser.add_argument("--eval_frequency", default=1, type=int, required=False)
parser.add_argument("--mlm_probability", default=0.1, type=float, required=False)
# model type
parser.add_argument('--do_avg', action='store_true', help='avrage hidden status', required=False)
parser.add_argument('--model_type',default="base",choices=["base", "cocosoda","multi-loss-cocosoda","no_aug_cocosoda"], help="base is codebert/graphcoder/unixcoder",required=False)
# moco
# moco specific configs:
parser.add_argument('--moco_dim', default=768, type=int,
help='feature dimension (default: 768)')
    parser.add_argument('--moco_k', default=32, type=int,
                        help='queue size; number of negative keys (default here: 32; the original MoCo default is 65536)')
parser.add_argument('--moco_m', default=0.999, type=float,
help='moco momentum of updating key encoder (default: 0.999)')
parser.add_argument('--moco_t', default=0.07, type=float,
help='softmax temperature (default: 0.07)')
# options for moco v2
parser.add_argument('--mlp', action='store_true',help='use mlp head')
## Required parameters
parser.add_argument("--train_data_file", default="dataset/java/train.jsonl", type=str, required=False,
help="The input training data file (a json file).")
parser.add_argument("--output_dir", default="saved_models/pre-train", type=str, required=False,
help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--eval_data_file", default="dataset/java/valid.jsonl", type=str,
help="An optional input evaluation data file to evaluate the MRR(a jsonl file).")
parser.add_argument("--test_data_file", default="dataset/java/test.jsonl", type=str,
help="An optional input test data file to test the MRR(a josnl file).")
parser.add_argument("--codebase_file", default="dataset/java/codebase.jsonl", type=str,
help="An optional input test data file to codebase (a jsonl file).")
parser.add_argument("--lang", default="java", type=str,
help="language.")
parser.add_argument("--model_name_or_path", default="DeepSoftwareAnalytics/CoCoSoDa", type=str,
help="The model checkpoint for weights initialization.")
parser.add_argument("--config_name", default="DeepSoftwareAnalytics/CoCoSoDa", type=str,
help="Optional pretrained config name or path if not the same as model_name_or_path")
parser.add_argument("--tokenizer_name", default="DeepSoftwareAnalytics/CoCoSoDa", type=str,
help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
parser.add_argument("--nl_length", default=50, type=int,
help="Optional NL input sequence length after tokenization.")
parser.add_argument("--code_length", default=100, type=int,
help="Optional Code input sequence length after tokenization.")
parser.add_argument("--data_flow_length", default=0, type=int,
help="Optional Data Flow input sequence length after tokenization.",required=False)
parser.add_argument("--do_train", action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval", action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_test", action='store_true',
help="Whether to run eval on the test set.")
parser.add_argument("--train_batch_size", default=4, type=int,
help="Batch size for training.")
parser.add_argument("--eval_batch_size", default=4, type=int,
help="Batch size for evaluation.")
parser.add_argument("--learning_rate", default=2e-5, type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--max_grad_norm", default=1.0, type=float,
help="Max gradient norm.")
parser.add_argument("--num_train_epochs", default=4, type=int,
help="Total number of training epochs to perform.")
parser.add_argument('--seed', type=int, default=3407,
help="random seed for initialization")
#print arguments
args = parser.parse_args()
return args
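# create_model(): when --data_aug_type is replace_type/other, the collected
# type names are added as special tokens and the embedding matrix is resized;
# the encoder is then wrapped in Model (base) or Multi_Loss_CoCoSoDa, with
# optional reloading of previously saved checkpoints.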
def create_model(args,model,tokenizer, config=None):
# logger.info("args.data_aug_type %s"%args.data_aug_type)
# replace token with type
if args.data_aug_type in ["replace_type" , "other"] and not args.only_save_the_nl_code_vec:
special_tokens_dict = {'additional_special_tokens': all_special_token}
logger.info(" new token %s"%(str(special_tokens_dict)))
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
if (args.loaded_model_filename) and ("pytorch_model.bin" in args.loaded_model_filename):
logger.info("reload pytorch model from {}".format(args.loaded_model_filename))
model.load_state_dict(torch.load(args.loaded_model_filename),strict=False)
# model.from_pretrain
if args.model_type == "base" :
model= Model(model)
elif args.model_type == "multi-loss-cocosoda":
model= Multi_Loss_CoCoSoDa(model,args, args.mlp)
if (args.loaded_model_filename) and ("pytorch_model.bin" not in args.loaded_model_filename) :
logger.info("reload model from {}".format(args.loaded_model_filename))
model.load_state_dict(torch.load(args.loaded_model_filename))
# model.load_state_dict(torch.load(args.loaded_model_filename,strict=False))
# model.from_pretrained(args.loaded_model_filename)
if (args.loaded_codebert_model_filename) :
logger.info("reload pytorch model from {}".format(args.loaded_codebert_model_filename))
model.load_state_dict(torch.load(args.loaded_codebert_model_filename),strict=False)
logger.info(model.model_parameters())
return model
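# main(): build the tokenizer and encoder (RoBERTa-style models, or the encoder
# of a CodeT5 checkpoint), optionally run multi-lingual continued pre-training
# and fine-tuning, then evaluate the best-MRR checkpoint on the valid/test sets.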
def main():
args = parse_args()
#set log
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
#set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.device = device
logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
pool = multiprocessing.Pool(cpu_cont)
# Set seed
set_seed(args.seed)
#build model
if "codet5" in args.model_name_or_path:
config = T5Config.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
model = model.encoder
else:
config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
model = RobertaModel.from_pretrained(args.model_name_or_path)
model=create_model(args,model,tokenizer,config)
logger.info("Training/evaluation parameters %s", args)
args.start_step = 0
model.to(args.device)
# Training
if args.do_multi_lang_continue_pre_train:
multi_lang_continue_pre_train(args, model, tokenizer, pool)
output_tokenizer_dir = os.path.join(args.output_dir,"tokenzier")
if not os.path.exists(output_tokenizer_dir):
os.makedirs( output_tokenizer_dir)
tokenizer.save_pretrained( output_tokenizer_dir)
if args.do_train:
train(args, model, tokenizer, pool)
# Evaluation
results = {}
if args.do_eval:
checkpoint_prefix = 'checkpoint-best-mrr/model.bin'
output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix))
if (not args.only_save_the_nl_code_vec) and (not args.do_zero_short) :
model.load_state_dict(torch.load(output_dir),strict=False)
model.to(args.device)
result=evaluate(args, model, tokenizer,args.eval_data_file, pool)
logger.info("***** Eval valid results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(round(result[key],4)))
if args.do_test:
logger.info("runnning test")
checkpoint_prefix = 'checkpoint-best-mrr/model.bin'
output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix))
if (not args.only_save_the_nl_code_vec) and (not args.do_zero_short) :
model.load_state_dict(torch.load(output_dir),strict=False)
model.to(args.device)
result=evaluate(args, model, tokenizer,args.test_data_file, pool)
logger.info("***** Eval test results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(round(result[key],4)))
save_json_data(args.output_dir, "result.jsonl", result)
return results
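# gen_vector(): for every per-epoch checkpoint saved by train(), re-encode the
# train and test sets and dump the pooled NL/code vectors as .npy files under
# the hard-coded output directories below.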
def gen_vector():
args = parse_args()
#set log
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
#set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.device = device
logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
pool = multiprocessing.Pool(cpu_cont)
# Set seed
set_seed(args.seed)
if "codet5" in args.model_name_or_path:
config = T5Config.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
model = model.encoder
else:
config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
model = RobertaModel.from_pretrained(args.model_name_or_path)
model=create_model(args,model,tokenizer,config)
if args.data_aug_type == "replace_type" :
train_dataset=TextDataset(tokenizer, args, args.train_data_file, pool)
else:
# if "unixcoder" in args.model_name_or_path or "coco" in args.model_name_or_path :
train_dataset=TextDataset_unixcoder(tokenizer, args, args.train_data_file, pool)
# else:
# train_dataset=TextDataset(tokenizer, args, args.train_data_file, pool)
train_sampler = SequentialSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,num_workers=4,drop_last=False)
for idx in range(args.num_train_epochs):
output_dir_epoch = os.path.join(args.output_dir, '{}'.format(idx))
output_dir_epoch = os.path.join(output_dir_epoch, '{}'.format('model.bin'))
model.load_state_dict(torch.load(output_dir_epoch),strict=False)
model.to(args.device)
model_eval = model.module if hasattr(model,'module') else model
all_nl_vec = []
all_code_vec = []
for step,batch in enumerate(train_dataloader):
code_inputs = batch[0].to(args.device)
nl_inputs = batch[1].to(args.device)
#get code and nl vectors
nl_outputs = model_eval.nl_encoder_q(nl_inputs, attention_mask=nl_inputs.ne(1))
nl_vec =nl_outputs [1]
code_outputs = model_eval.code_encoder_q(code_inputs, attention_mask=code_inputs.ne(1))
code_vec =code_outputs [1]
all_nl_vec.append(nl_vec.detach().cpu().numpy())
all_code_vec.append(code_vec.detach().cpu().numpy())
all_nl_vec = np.concatenate(all_nl_vec, axis=0)
all_code_vec = np.concatenate(all_code_vec, axis=0)
print(all_nl_vec.shape, all_code_vec.shape)
np.save("/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/" + str(idx) + "/all_nl_vec.npy", all_nl_vec)
np.save("/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/" + str(idx) + "/all_code_vec.npy", all_code_vec)
idxs = [i for i in range(len(all_nl_vec))]
for epoch in range(1,2):
idxs_dir_path = "/home/yiming/cocosoda/CoCoSoDa/saved_models/codesearch_contrastive_learning/Model/Epoch_" + str(epoch)
        if not os.path.exists(idxs_dir_path):
            os.mkdir(idxs_dir_path)
idxs_path = idxs_dir_path + "/index.json"
        with open(idxs_path, mode='w') as json_file:
            json.dump(idxs, json_file, indent=4)
if args.data_aug_type == "replace_type" :
test_dataset=TextDataset(tokenizer, args, args.test_data_file, pool)
else:
# if "unixcoder" in args.model_name_or_path or "coco" in args.model_name_or_path :
test_dataset=TextDataset_unixcoder(tokenizer, args, args.test_data_file, pool)
# else:
# test_dataset=TextDataset(tokenizer, args, args.test_data_file, pool)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,num_workers=4,drop_last=False)
for idx in range(args.num_train_epochs):
output_dir_epoch = os.path.join(args.output_dir, '{}'.format(idx))
output_dir_epoch = os.path.join(output_dir_epoch, '{}'.format('model.bin'))
model.load_state_dict(torch.load(output_dir_epoch),strict=False)
model.to(args.device)
model_eval = model.module if hasattr(model,'module') else model
all_nl_vec = []
all_code_vec = []
for step,batch in enumerate(test_dataloader):
code_inputs = batch[0].to(args.device)
nl_inputs = batch[1].to(args.device)
#get code and nl vectors
nl_outputs = model_eval.nl_encoder_q(nl_inputs, attention_mask=nl_inputs.ne(1))
nl_vec =nl_outputs [1]
code_outputs = model_eval.code_encoder_q(code_inputs, attention_mask=code_inputs.ne(1))
code_vec =code_outputs [1]
all_nl_vec.append(nl_vec.detach().cpu().numpy())
all_code_vec.append(code_vec.detach().cpu().numpy())
all_nl_vec = np.concatenate(all_nl_vec, axis=0)
all_code_vec = np.concatenate(all_code_vec, axis=0)
print(all_nl_vec.shape, all_code_vec.shape)
np.save("/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/" + str(idx) + "/test_all_nl_vec.npy", all_nl_vec)
np.save("/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/" + str(idx) + "/test_all_code_vec.npy", all_code_vec)
def gen_label():
args = parse_args()
#set log
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )
#set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.device = device
logger.info("device: %s, n_gpu: %s",device, args.n_gpu)
pool = multiprocessing.Pool(cpu_cont)
# # Set seed
# set_seed(args.seed)
# if "codet5" in args.model_name_or_path:
# config = T5Config.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
# tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
# model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
# model = model.encoder
# else:
# config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
# tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
# model = RobertaModel.from_pretrained(args.model_name_or_path)
# model=create_model(args,model,tokenizer,config)
# if args.data_aug_type == "replace_type" :
# train_dataset=TextDataset(tokenizer, args, args.train_data_file, pool)
# else:
# # if "unixcoder" in args.model_name_or_path or "coco" in args.model_name_or_path :
# train_dataset=TextDataset_unixcoder(tokenizer, args, args.train_data_file, pool)
# # else:
# # train_dataset=TextDataset(tokenizer, args, args.train_data_file, pool)
# train_sampler = SequentialSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,num_workers=4,drop_last=False)
code_list = []
docstring_list = []
with open(args.train_data_file, 'rt') as gz_file:
for line in gz_file:
data = json.loads(line)
code = data['code']
docstring = data['docstring']
            # append the code and docstring to the lists
code_list.append(code)
docstring_list.append(docstring)
print(len(code_list))
print(len(docstring_list))
# print(code_list[0])
# print(docstring_list[0])
code_output_file = '/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/code_list.json'
docstring_output_file = '/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/docstring_list.json'
    # save code_list to a JSON file
    with open(code_output_file, 'w') as file:
        json.dump(code_list, file)
    # save docstring_list to a JSON file
    with open(docstring_output_file, 'w') as file:
        json.dump(docstring_list, file)
code_list = []
docstring_list = []
with open(args.test_data_file, 'rt') as gz_file:
for line in gz_file:
data = json.loads(line)
code = data['code']
docstring = data['docstring']
            # append the code and docstring to the lists
code_list.append(code)
docstring_list.append(docstring)
print(len(code_list))
print(len(docstring_list))
code_output_file = '/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/test_code_list.json'
docstring_output_file = '/home/yiming/cocosoda/CoCoSoDa/saved_models/fine_tune/ruby/test_docstring_list.json'
    # save code_list to a JSON file
    with open(code_output_file, 'w') as file:
        json.dump(code_list, file)
    # save docstring_list to a JSON file
    with open(docstring_output_file, 'w') as file:
        json.dump(docstring_list, file)
if __name__ == "__main__":
# main()
# gen_vector()
gen_label()