from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
    pipeline,
)
import torch
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


class MaskedLM:
    def __init__(self):
        self.model = None
        self.metric = None
        self.tokenizer = None
        self.data_collator = None
        self.raw_data = None
        self.model_checkpoint = None
        self.tokenized_dataset = None
        self.chunk_size = 128
        self.chunks_dataset = None
        self.split_dataset = None
        self.args = None

    def load_dataset(self, name="imdb"):
        self.raw_data = load_dataset(name)
        print("Name of dataset: ", name)
        print(self.raw_data)

    def load_support(self, mlm_probability=0.15):
        self.model_checkpoint = "distilbert-base-uncased"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm_probability=mlm_probability
        )
        print("Name of model checkpoint: " + self.model_checkpoint)
        print("Tokenizer is fast: ", self.tokenizer.is_fast)
        print("Mask token symbol of tokenizer: ", self.tokenizer.mask_token)
        print("Model max length of tokenizer: ", self.tokenizer.model_max_length)

    def explore_infoModel(self, k=5):
        model = AutoModelForMaskedLM.from_pretrained(self.model_checkpoint)
        model_parameters = model.num_parameters() / 1_000_000
        print(f">>> Number of parameters of {self.model_checkpoint}: {round(model_parameters)}M")
        example = "This is a great [MASK]."
        print("\n")
        print(">>> Example: ", example)
        inputs = self.tokenizer(example, return_tensors="pt")
        token_logits = model(**inputs).logits
        print(f"{'Number of tokens: ':<40}{len(inputs.tokens())}")
        print(f"{'Tokens prepared for training: ':<40}{inputs.tokens()}")
        print(f"{'IDs of tokens: ':<40}{inputs.input_ids}")
        print(f"{'Mask token ID of model: ':<40}{self.tokenizer.mask_token_id}")
        print(f"{'Logits of example: ':<40}{token_logits}")
        print(f"{'Shape of logits: ':<40}{token_logits.size()}")
        # Find the POSITION of [MASK] so its logits can be extracted
        mask_token_index = torch.where(inputs.input_ids == self.tokenizer.mask_token_id)[1]
        print(f"{'Position of masked token index: ':<40}{mask_token_index}")
        # Extract the LOGITS over the whole VOCAB at the [MASK] position
        mask_token_logits = token_logits[0, mask_token_index, :]
        print(f"{'Logits over Vocab for [MASK]: ':<40}{mask_token_logits}")
        # Pick the TOP-k CANDIDATES for [MASK]: highest logits and their vocab indices
        top_k = torch.topk(mask_token_logits, k, dim=1)
        top_k_values = top_k.values[0].tolist()
        print(f"{'Top values of suitable tokens in Vocab: ':<40}{top_k_values}")
        top_k_tokens = top_k.indices[0].tolist()
        print(f"{'Positions of suitable tokens in Vocab: ':<40}{top_k_tokens}")
        # Show the TOP CANDIDATES in context
        for token in top_k_tokens:
            print(">>> ", example.replace(self.tokenizer.mask_token, self.tokenizer.decode([token])))

    def get_feature_items(self, set="train", index=0, feature="text"):
        # Note: 0 is a valid label, so only None counts as missing
        return self.raw_data[set][index][feature]
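    # A minimal sketch (commented out; assumes the locals of explore_infoModel):
    # the raw top-k logits above can be turned into probabilities with a softmax
    # over the vocabulary axis, which makes the scores comparable as confidences:
    #
    #   probs = torch.softmax(mask_token_logits, dim=-1)
    #   top_k = torch.topk(probs, k, dim=-1)   # top_k.values[0] sums to at most 1
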
    def get_pair_items(self, set="train", index=0, feature1="text", feature2="label"):
        feature1 = self.get_feature_items(set, index, feature1)
        feature2 = self.get_feature_items(set, index, feature2)
        if isinstance(feature1, (list, tuple)) and isinstance(feature2, (list, tuple)):
            # Word-level datasets: align each word with its label
            line1 = ""
            line2 = ""
            for word, label in zip(feature1, feature2):
                line1 += str(word)
                line2 += str(label)
            return line1, line2
        # Sentence-level datasets such as IMDB: one text, one label
        return str(feature1), str(feature2)

    def get_tokenizer(self, set="train", index=0, feature="text"):
        inputs = self.tokenizer(self.get_feature_items(set, index, feature))
        return inputs.tokens(), inputs.word_ids()

    def tokenizer_dataset(self, example):
        inputs = self.tokenizer(example["text"])
        inputs["word_ids"] = [inputs.word_ids(i) for i in range(len(inputs["input_ids"]))]
        return inputs

    def map_tokenize_dataset(self):
        print("Start of processing dataset")
        self.tokenized_dataset = self.raw_data.map(
            self.tokenizer_dataset, batched=True, remove_columns=["text", "label"]
        )
        print("Done mapping")
        print("Tokenized dataset: ", self.tokenized_dataset)

    def group_text_chunk(self, example):
        # Concatenate all texts in the batch
        concatenated_example = {k: sum(example[k], []) for k in example.keys()}
        # Compute the total length of the concatenated batch
        total_length = len(concatenated_example["input_ids"])
        # Drop the remainder so the length is a multiple of chunk_size
        total_length = (total_length // self.chunk_size) * self.chunk_size
        # Split into chunks of chunk_size
        chunks = {
            k: [t[i: i + self.chunk_size] for i in range(0, total_length, self.chunk_size)]
            for k, t in concatenated_example.items()
        }
        # Create a LABELS column as a copy of INPUT_IDS (the collator masks the inputs later)
        chunks["labels"] = chunks["input_ids"].copy()
        return chunks

    def map_chunk_dataset(self):
        print("Start of processing dataset")
        self.chunks_dataset = self.tokenized_dataset.map(self.group_text_chunk, batched=True)
        print("Done mapping")
        print("Chunked dataset: ", self.chunks_dataset)

    def dataset_split(self, test_size=0.2):
        self.split_dataset = self.chunks_dataset["train"].train_test_split(
            test_size=test_size, seed=42
        )
        print("Prepared dataset: ", self.split_dataset)

    def create_model(self):
        print("Start creating model")
        self.model = AutoModelForMaskedLM.from_pretrained(self.model_checkpoint)
        print(self.model)

    def create_argumentTrainer(self, output_dir="fine_tuned_", eval_strategy="epoch",
                               logging_strategy="epoch", learning_rate=2e-5,
                               num_train_epochs=20, weight_decay=0.01, batch_size=64,
                               save_strategy="epoch", push_to_hub=False,
                               hub_model_id="", fp16=True):
        logging_steps = len(self.split_dataset["train"]) // batch_size
        self.args = TrainingArguments(
            # use_cpu=True,
            output_dir=f"{output_dir}{self.model_checkpoint}",
            overwrite_output_dir=True,
            eval_strategy=eval_strategy,
            logging_strategy=logging_strategy,
            save_strategy=save_strategy,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            push_to_hub=push_to_hub,
            hub_model_id=hub_model_id,
            fp16=fp16,
            logging_steps=logging_steps,
        )
        print("Arguments ready for training")
        return self.args

    def call_train(self, model_path="pretrained_model_", set_train="train", set_val="test",
                   push_to_hub=False, save_local=False):
        trainer = Trainer(
            model=self.model,
            args=self.args,
            train_dataset=self.split_dataset[set_train],
            eval_dataset=self.split_dataset[set_val],
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
        )
        eval_result1 = trainer.evaluate()
        print("Perplexity before training: ", math.exp(eval_result1["eval_loss"]))
        print("Start training")
        trainer.train()
        print("Done training")
        eval_result2 = trainer.evaluate()
        print("Perplexity after training: ", math.exp(eval_result2["eval_loss"]))
        if save_local:
            trainer.save_model(model_path + self.model_checkpoint)
            print("Done saving to local")
        if push_to_hub:
            trainer.push_to_hub(commit_message="Training complete")
            print("Done pushing to hub")

    def call_pipeline(self, local=False, path="", example=""):
        if local:
            # Load the checkpoint saved by call_train(save_local=True)
            model_checkpoint = "pretrained_model_" + self.model_checkpoint
        else:
            # Load from the Hugging Face Hub
            model_checkpoint = path
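        # For each [MASK], the "fill-mask" pipeline returns a list of dicts with
        # keys "score", "token", "token_str", and "sequence" (the filled-in text).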
        mask_filler = pipeline(
            "fill-mask",
            model=model_checkpoint,
        )
        print(mask_filler(example))


if __name__ == "__main__":
    ''' 1_LOADING DATASET '''
    mlm = MaskedLM()
    mlm.load_dataset()
    print("-" * 50, "Loading supporting components", "-" * 50)
    mlm.load_support()
    print("-" * 50, "Loading supporting components", "-" * 50)

    ''' 2_EXPLORING DATASET, MODEL '''
    print("-" * 50, "Exploring model information", "-" * 50)
    mlm.explore_infoModel()
    print("-" * 50, "Exploring model information", "-" * 50)
    print("Example[0] (text) in dataset: ",
          mlm.get_feature_items(set="train", index=0, feature="text")[:100] + "...")
    print("Example[0] (label) in dataset: ",
          mlm.get_feature_items(set="train", index=0, feature="label"))
    line1, line2 = mlm.get_pair_items(set="train", index=1, feature1="text", feature2="label")
    print("--> Input of Example[1]: ", line1[:20] + "...")
    print("--> Output of Example[1]: ", line2[:20] + "...")

    ''' 3_PRE-PROCESSING DATASET '''
    tokens, word_ids = mlm.get_tokenizer(set="train", index=0, feature="text")
    print("Tokens list of Example 0: ", tokens)
    print("Word IDs list of Example 0: ", word_ids)
    mlm.map_tokenize_dataset()
    mlm.map_chunk_dataset()
    mlm.dataset_split()

    ''' 4_MODEL INITIALIZATION '''
    print("-" * 50, f"Information of {mlm.model_checkpoint}", "-" * 50)
    mlm.create_model()
    print("-" * 50, f"Information of {mlm.model_checkpoint}", "-" * 50)

    ''' 5_HYPERPARAMETER SELECTION '''
    mlm.create_argumentTrainer(push_to_hub=True,
                               hub_model_id="Chessmen/" + "fine_tune_" + mlm.model_checkpoint)
    mlm.call_train(save_local=True, push_to_hub=True)

    ''' 6_USE PRE-TRAINED MODEL '''
    mlm.call_pipeline(path="Chessmen/fine_tune_distilbert-base-uncased",
                      example="This is a great [MASK].")
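
# --- Illustrative sketch (commented out; not part of the run above) ----------
# A minimal preview of what DataCollatorForLanguageModeling does to a batch:
# it replaces a random ~15% of the input tokens with [MASK] (or a random
# token) and builds "labels" that are -100 everywhere except at the masked
# positions. Assumes load_support(), map_tokenize_dataset() and
# map_chunk_dataset() have already run; "word_ids" is dropped first because
# the collator only accepts tensor-like fields.
#
#   samples = [mlm.chunks_dataset["train"][i] for i in range(2)]
#   samples = [{k: v for k, v in s.items() if k != "word_ids"} for s in samples]
#   batch = mlm.data_collator(samples)
#   for ids in batch["input_ids"]:
#       print(">>> ", mlm.tokenizer.decode(ids))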