""" This script provides an example to wrap TencentPretrain for ChID (a multiple choice dataset). """ import sys import os import argparse import json import random import torch tencentpretrain_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(tencentpretrain_dir) from tencentpretrain.utils.constants import * from tencentpretrain.utils.tokenizers import * from tencentpretrain.utils.optimizers import * from tencentpretrain.utils import * from tencentpretrain.utils.config import load_hyperparam from tencentpretrain.utils.seed import set_seed from tencentpretrain.utils.logging import init_logger from tencentpretrain.model_saver import save_model from tencentpretrain.opts import finetune_opts, adv_opts from finetune.run_c3 import MultipleChoice from finetune.run_classifier import build_optimizer, load_or_initialize_parameters, train_model, batch_loader, evaluate def tokenize_chid(text): output = [] first_idiom = True while True: if first_idiom: idiom_index = text.find("#idiom") output.extend(text[:idiom_index]) output.append(text[idiom_index : idiom_index + 13]) pre_idiom_index = idiom_index first_idiom = False else: if text[idiom_index + 1 :].find("#idiom") == -1: output.extend(text[pre_idiom_index + 13 :]) break else: idiom_index = idiom_index + 1 + text[idiom_index + 1 :].find("#idiom") output.extend(text[pre_idiom_index + 13 : idiom_index]) output.append(text[idiom_index : idiom_index + 13]) pre_idiom_index = idiom_index return output def add_tokens_around(tokens, idiom_index, tokens_num): left_tokens_num = tokens_num // 2 right_tokens_num = tokens_num - left_tokens_num if idiom_index >= left_tokens_num and (len(tokens) - 1 - idiom_index) >= right_tokens_num: left_tokens = tokens[idiom_index - left_tokens_num : idiom_index] right_tokens = tokens[idiom_index + 1 : idiom_index + 1 + right_tokens_num] elif idiom_index < left_tokens_num: left_tokens = tokens[:idiom_index] right_tokens = tokens[idiom_index + 1 : idiom_index + 1 + tokens_num - len(left_tokens)] elif (len(tokens) - 1 - idiom_index) < right_tokens_num: right_tokens = tokens[idiom_index + 1 :] left_tokens = tokens[idiom_index - (tokens_num - len(right_tokens)) : idiom_index] return left_tokens, right_tokens def read_dataset(args, data_path, answer_path): if answer_path is not None: answers = json.load(open(answer_path)) dataset = [] max_tokens_for_doc = args.seq_length - 3 group_index = 0 for line in open(data_path, mode="r", encoding="utf-8"): example = json.loads(line) options = example["candidates"] for context in example["content"]: chid_tokens = tokenize_chid(context) tags = [token for token in chid_tokens if "#idiom" in token] for tag in tags: if answer_path is not None: tgt = answers[tag] else: tgt = -1 tokens = [] for i, token in enumerate(chid_tokens): if "#idiom" in token: sub_tokens = [str(token)] else: sub_tokens = args.tokenizer.tokenize(token) for sub_token in sub_tokens: tokens.append(sub_token) idiom_index = tokens.index(tag) left_tokens, right_tokens = add_tokens_around(tokens, idiom_index, max_tokens_for_doc - 1) for i in range(len(left_tokens)): if "#idiom" in left_tokens[i] and left_tokens[i] != tag: left_tokens[i] = MASK_TOKEN for i in range(len(right_tokens)): if "#idiom" in right_tokens[i] and right_tokens[i] != tag: right_tokens[i] = MASK_TOKEN dataset.append(([], tgt, [], tag, group_index)) for option in options: option_tokens = args.tokenizer.tokenize(option) tokens = [CLS_TOKEN] + option_tokens + [SEP_TOKEN] + left_tokens + [SEP_TOKEN] + right_tokens + [SEP_TOKEN] src = 
                    src = args.tokenizer.convert_tokens_to_ids(tokens)[: args.seq_length]
                    seg = [0] * len(src)

                    while len(src) < args.seq_length:
                        src.append(0)
                        seg.append(0)

                    dataset[-1][0].append(src)
                    dataset[-1][2].append(seg)

                # Pad the group with empty candidates up to max_choices_num.
                while len(dataset[-1][0]) < args.max_choices_num:
                    dataset[-1][0].append([0] * args.seq_length)
                    dataset[-1][2].append([0] * args.seq_length)

                group_index += 1

    return dataset


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--train_answer_path", type=str, required=True,
                        help="Path of the answers for trainset.")
    parser.add_argument("--dev_answer_path", type=str, required=True,
                        help="Path of the answers for devset.")
    parser.add_argument("--max_choices_num", default=10, type=int,
                        help="The maximum number of candidate answers; groups with fewer candidates are padded.")

    adv_opts(parser)

    args = parser.parse_args()

    args.labels_num = args.max_choices_num

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build multiple choice model.
    model = MultipleChoice(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path, args.train_answer_path)
    instances_num = len(trainset)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info("The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    if args.use_adv:
        args.adv_method = str2adv[args.adv_type](model)

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src = torch.LongTensor([example[0] for example in trainset])
        tgt = torch.LongTensor([example[1] for example in trainset])
        seg = torch.LongTensor([example[2] for example in trainset])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, read_dataset(args, args.dev_path, args.dev_answer_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)


if __name__ == "__main__":
    main()
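
# A minimal sketch of how this script might be invoked. Only --vocab_path,
# --spm_model_path, --train_answer_path, --dev_answer_path and --max_choices_num
# are defined in this file; the remaining option names (--pretrained_model_path,
# --config_path, --train_path, --dev_path, --output_model_path, --seq_length,
# --epochs_num, --batch_size) are assumed to be provided by finetune_opts(),
# matching the args.* attributes used above. All paths and values are illustrative.
#
#   python3 finetune/run_chid.py \
#       --pretrained_model_path models/pretrained_model.bin \
#       --vocab_path models/google_zh_vocab.txt \
#       --config_path models/bert/base_config.json \
#       --train_path datasets/chid/train.json \
#       --train_answer_path datasets/chid/train_answer.json \
#       --dev_path datasets/chid/dev.json \
#       --dev_answer_path datasets/chid/dev_answer.json \
#       --output_model_path models/chid_model.bin \
#       --max_choices_num 10 --seq_length 64 --epochs_num 3 --batch_size 16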