In [None]:
# !pip install transformers

In [8]:
from __future__ import absolute_import
import torch
import logging
import torch.nn as nn
from model import Seq2Seq
from transformers import (
 RobertaConfig, 
 RobertaModel, 
 RobertaTokenizer
)

import regex as re

# disable warnings
import warnings
warnings.filterwarnings("ignore")

# Registry of supported base architectures keyed by model type; RoBERTa is the only entry.
MODEL_CLASSES = dict(roberta=(RobertaConfig, RobertaModel, RobertaTokenizer))

# Configure logging: timestamped records at INFO level, then grab this module's logger.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
LOG_DATE_FORMAT = '%m/%d/%Y %H:%M:%S'
logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
class CONFIG:
    """Static settings namespace for this notebook.

    Every setting is a plain class attribute read as ``CONFIG.<name>``;
    the class is never instantiated in this notebook.
    """

    # --- model / tokenizer selection ---
    model_type = "roberta"
    model_name_or_path = "microsoft/codebert-base"
    config_name = ""        # empty -> model_name_or_path is used when loading the config
    tokenizer_name = ""     # empty -> model_name_or_path is used when loading the tokenizer
    load_model_path = None
    cache_dir = "cache"

    # --- sequence lengths and decoding ---
    max_source_length = 256   # max_length used when encoding the input code
    max_target_length = 128   # max_length handed to Seq2Seq
    beam_size = 10

    # --- data and output locations ---
    # NOTE(review): output_dir looks like a Colab Drive mount path; adjust when running elsewhere.
    output_dir = "/content/drive/MyDrive/CodeSummarization"
    train_filename = "dataset/python/train.jsonl"
    dev_filename = "dataset/python/valid.jsonl"
    test_filename = "dataset/python/test.jsonl"

    # --- run switches ---
    do_train = True
    do_eval = True
    do_test = True

    # --- batching and schedule ---
    train_batch_size = 12
    eval_batch_size = 32
    gradient_accumulation_steps = 1
    num_train_epochs = 3.0
    max_steps = -1
    warmup_steps = 0
    train_steps = 100000
    eval_steps = 10000
    save_every = 5000

    # --- optimizer ---
    learning_rate = 5e-5
    weight_decay = 1e-4
    adam_epsilon = 1e-8
    max_grad_norm = 1.0

    # --- hardware ---
    local_rank = -1
    no_cuda = False           # True forces CPU even when CUDA is available
    n_gpu = torch.cuda.device_count()

## Load tokenizer

In [4]:
# Load the tokenizer for the checkpoint named in CONFIG rather than repeating the
# model id here, so this cell cannot drift from the model-loading cell.
# `RobertaTokenizer` and `logger` come from the imports cell at the top of the notebook.
tokenizer = RobertaTokenizer.from_pretrained(CONFIG.model_name_or_path, cache_dir=CONFIG.cache_dir)

# Show each special token next to its vocabulary id.
special_tokens = (
    (tokenizer.cls_token, tokenizer.cls_token_id),
    (tokenizer.sep_token, tokenizer.sep_token_id),
    (tokenizer.pad_token, tokenizer.pad_token_id),
    (tokenizer.mask_token, tokenizer.mask_token_id),
)
for token, token_id in special_tokens:
    print(f'{token} index: {token_id}')

Downloading: 0%| | 0.00/899k [00:00, ?B/s]

Downloading: 0%| | 0.00/456k [00:00, ?B/s]

Downloading: 0%| | 0.00/150 [00:00, ?B/s]

Downloading: 0%| | 0.00/25.0 [00:00, ?B/s]

Downloading: 0%| | 0.00/498 [00:00, ?B/s]

 index: 0
 index: 2
 index: 1
 index: 50264


In [None]:
# Sample Python function (with a docstring) used to inspect the tokenizer's raw
# sub-word output, before any of the notebook's own preprocessing is applied.
input_str = "def sina_xml_to_url_list(xml_data):\n \"\"\"str->list\n Convert XML to URL List.\n From Biligrab.\n \"\"\"\n rawurl = []\n dom = parseString(xml_data)\n for node in dom.getElementsByTagName('durl'):\n url = node.getElementsByTagName('url')[0]\n rawurl.append(url.childNodes[0].data)\n return rawurl"
input_tokens = tokenizer.tokenize(input_str)
print(input_tokens)

In [46]:
def preprocessing(code_segment):
    """Strip non-code text from a source snippet and split it into tokens.

    Steps, in order:
      1. drop triple-double-quoted docstrings (they may span several lines);
      2. drop ``#`` comments, one line at a time;
      3. collapse every whitespace run (including newlines) to one space;
      4. drop ``<...>`` tag-like spans and ``http...`` URLs;
      5. surround every character that is neither a word character nor
         whitespace with spaces, so it becomes its own token.

    Comments are removed *before* newlines are flattened. Once the snippet is
    a single line, ``#.*`` would match from the first ``#`` to the end of the
    whole snippet and silently discard all code after the first comment.

    Args:
        code_segment: source code as a single string.

    Returns:
        list of str: the whitespace-separated tokens that remain.
    """
    # docstrings first: DOTALL lets `.` cross line breaks inside the docstring
    code_segment = re.sub(r'""".*?"""', '', code_segment, flags=re.DOTALL)

    # comments next, while line boundaries still exist (`.` stops at a newline)
    code_segment = re.sub(r'#.*', '', code_segment)

    # flatten newlines and repeated blanks into single spaces
    code_segment = re.sub(r'\s+', ' ', code_segment)

    # remove html tags
    code_segment = re.sub(r'<.*?>', '', code_segment)

    # remove urls
    code_segment = re.sub(r'http\S+', '', code_segment)

    # split special chars into different tokens
    code_segment = re.sub(r'([^\w\s])', r' \1 ', code_segment)

    return code_segment.split()

# Sanity check: display the token list produced for the snippet currently held in `input_str`.
preprocessing(input_str)

['def',
 'sina_xml_to_url_list',
 '(',
 'xml_data',
 ')',
 ':',
 'rawurl',
 '=',
 '[',
 ']',
 'dom',
 '=',
 'parseString',
 '(',
 'xml_data',
 ')',
 'for',
 'node',
 'in',
 'dom',
 '.',
 'getElementsByTagName',
 '(',
 "'",
 'durl',
 "'",
 ')',
 ':',
 'url',
 '=',
 'node',
 '.',
 'getElementsByTagName',
 '(',
 "'",
 'url',
 "'",
 ')',
 '[',
 '0',
 ']',
 'rawurl',
 '.',
 'append',
 '(',
 'url',
 '.',
 'childNodes',
 '[',
 '0',
 ']',
 '.',
 'data',
 ')',
 'return',
 'rawurl']

In [48]:
# Short sample function without docstring or comments: run it through
# `preprocessing` and print the resulting token list.
input_str = "def get_data():\n data = []\n for i in range(10):\n data.append(i)\n return data"
input_tokens = preprocessing(input_str)
print(f'Tokens = {input_tokens}')

Tokens = ['def', 'get_data', '(', ')', ':', 'data', '=', '[', ']', 'for', 'i', 'in', 'range', '(', '10', ')', ':', 'data', '.', 'append', '(', 'i', ')', 'return', 'data']


In [27]:
# Sample function that contains a docstring: run it through `preprocessing`
# and print the resulting token list (the docstring text should not appear).
input_str = "def sina_xml_to_url_list(xml_data):\n \"\"\"str->list\n Convert XML to URL List.\n From Biligrab.\n \"\"\"\n rawurl = []\n dom = parseString(xml_data)\n for node in dom.getElementsByTagName('durl'):\n url = node.getElementsByTagName('url')[0]\n rawurl.append(url.childNodes[0].data)\n return rawurl"
input_tokens = preprocessing(input_str)
print(f'Tokens = {input_tokens}')

Tokens = ['def', 'sina_xml_to_url_list', '(', 'xml_data', ')', ':', 'rawurl', '=', '[', ']', 'dom', '=', 'parseString', '(', 'xml_data', ')', 'for', 'node', 'in', 'dom', '.', 'getElementsByTagName', '(', "'", 'durl', "'", ')', ':', 'url', '=', 'node', '.', 'getElementsByTagName', '(', "'", 'url', "'", ')', '[', '0', ']', 'rawurl', '.', 'append', '(', 'url', '.', 'childNodes', '[', '0', ']', '.', 'data', ')', 'return', 'rawurl']


{'input_ids': tensor([[ 0, 9232, 3, 1640, 3, 43, 35, 3, 5214, 10975,
 742, 12623, 5214, 3, 1640, 3, 43, 1990, 46840, 179,
 12623, 4, 3, 1640, 108, 3, 108, 43, 35, 6423,
 5214, 46840, 4, 3, 1640, 108, 6423, 108, 43, 10975,
 288, 742, 3, 4, 48696, 1640, 6423, 4, 3, 10975,
 288, 742, 4, 23687, 43, 30921, 3, 2, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [43]:
# Encode the pre-split tokens into fixed-length PyTorch tensors.
# `padding="max_length"` is the documented replacement for the deprecated
# `pad_to_max_length=True`; together with `max_length` it pads to the same width.
# NOTE(review): the printed ids contain 3 where identifiers such as `get_data`
# appear; presumably that is the unknown-token id, because a list of strings is
# looked up word by word. Confirm this matches how the model was trained before
# changing it (e.g. via `is_split_into_words=True`).
encoded_input = tokenizer.encode_plus(
    input_tokens,
    max_length=CONFIG.max_source_length,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)
print(encoded_input)

{'input_ids': tensor([[ 0, 9232, 3, 1640, 43, 35, 23687, 5214, 10975, 742,
 1990, 118, 179, 9435, 1640, 698, 43, 35, 23687, 4,
 48696, 1640, 118, 43, 30921, 23687, 2, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 0, 0, 0, 0,

## Load model

In [51]:
# The three RoBERTa classes used to build the configuration, encoder and tokenizer.
config_class = RobertaConfig
model_class = RobertaModel
tokenizer_class = RobertaTokenizer

# An explicit config / tokenizer name wins; an empty one falls back to the base checkpoint.
config_source = CONFIG.config_name or CONFIG.model_name_or_path
tokenizer_source = CONFIG.tokenizer_name or CONFIG.model_name_or_path

# Model configuration; a copy is also written to the local 'config' directory.
model_config = config_class.from_pretrained(config_source, cache_dir=CONFIG.cache_dir)
model_config.save_pretrained('config')

# Tokenizer
tokenizer = tokenizer_class.from_pretrained(tokenizer_source, cache_dir=CONFIG.cache_dir)

# Encoder: pretrained weights from the base checkpoint
encoder = model_class.from_pretrained(
    CONFIG.model_name_or_path,
    config=model_config,
    cache_dir=CONFIG.cache_dir,
)

# Decoder: a stack of 6 Transformer decoder layers sized to match the encoder
decoder_layer = nn.TransformerDecoderLayer(
    d_model=model_config.hidden_size,
    nhead=model_config.num_attention_heads,
)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

# Seq2Seq wrapper: pretrained encoder plus the freshly built decoder; the tokenizer's
# CLS / SEP ids are passed as the start- and end-of-sequence ids.
model = Seq2Seq(
    encoder=encoder,
    decoder=decoder,
    config=model_config,
    beam_size=CONFIG.beam_size,
    max_length=CONFIG.max_target_length,
    sos_id=tokenizer.cls_token_id,
    eos_id=tokenizer.sep_token_id,
)

In [52]:
# Restore the saved weights. `map_location="cpu"` lets the checkpoint load even if
# its tensors were saved from a device that is not available here; the model is
# moved to the target device in a later cell.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
state_dict = torch.load("./models/pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)



## Prediction

In [53]:
# Use the GPU only when CUDA is available and not disabled in CONFIG; otherwise stay on CPU.
use_cuda = torch.cuda.is_available() and not CONFIG.no_cuda
device = torch.device("cuda") if use_cuda else torch.device("cpu")
model = model.to(device)

In [54]:
# Build the model input for one sample snippet: preprocess, encode to fixed-length
# tensors, then move the ids and the attention mask to the model's device.
input_str = "def get_data():\n data = []\n for i in range(10):\n data.append(i)\n return data"
input_tokens = preprocessing(input_str)
# `padding="max_length"` is the documented replacement for the deprecated
# `pad_to_max_length=True`; together with `max_length` it pads to the same width.
encoded_input = tokenizer.encode_plus(
    input_tokens,
    max_length=CONFIG.max_source_length,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)
print(encoded_input)

input_ids = encoded_input["input_ids"].to(device)
input_mask = encoded_input["attention_mask"].to(device)


{'input_ids': tensor([[ 0, 9232, 3, 1640, 43, 35, 23687, 5214, 10975, 742,
 1990, 118, 179, 9435, 1640, 698, 43, 35, 23687, 4,
 48696, 1640, 118, 43, 30921, 23687, 2, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 0, 0, 0, 0,

In [59]:
# Inference only: put the model in evaluation mode so dropout layers are inactive,
# and run the forward pass without autograd so no gradient graph is built.
model.eval()
with torch.no_grad():
    output = model(input_ids, input_mask)
print(f'Summary.shape = {output.shape}')
print(f'Summary = {output}')

Summary.shape = torch.Size([1, 10, 128])
Summary = tensor([[[42555, 10, 889, ..., 0, 0, 0],
 [42555, 10, 889, ..., 0, 0, 0],
 [42555, 10, 889, ..., 0, 0, 0],
 ...,
 [42555, 10, 889, ..., 0, 0, 0],
 [42555, 10, 889, ..., 0, 0, 0],
 [42555, 10, 889, ..., 0, 0, 0]]], device='cuda:0')


In [61]:
# Decode every candidate returned for the first (and only) input. Iterating over the
# candidates themselves avoids hard-coding how many the model returned.
summary = output[0]
for candidate in summary:
    print(f'{candidate.shape}')
    # skip_special_tokens drops special-token ids from the decoded text
    pred = tokenizer.decode(candidate, skip_special_tokens=True)
    print(pred)

torch.Size([128])
Return a list of data.
torch.Size([128])
Return a list of int values.
torch.Size([128])
Return a list of ints.
torch.Size([128])
Return a list of ints
torch.Size([128])
Return a list of the number of integers.
torch.Size([128])
Return a list of the number of data.
torch.Size([128])
Return a list of the number of digits.
torch.Size([128])
Return a list of the number of numbers.
torch.Size([128])
Return a list of data in a list.
torch.Size([128])
Return a list of data in a list of data
