Initial Commit

64ae4c7 unverified almost 3 years ago

19 kB

	# This training script is a duplicate of the Training.ipynb notebook but can be invoked from the terminal

	import os
	print(os.getcwd())
	os.environ["PATH"]="/usr/local/cuda-11.7/bin:"+os.getenv("PATH")

	os.system('pip uninstall -y torch')
	os.system('pip uninstall -y einops')
	os.system('pip uninstall -y transformers')
	os.system('pip uninstall -y sentence_transformers')
	os.system('pip uninstall -y datasets')
	os.system('pip uninstall -y sagemaker')
	os.system('pip uninstall -y smart_open')
	os.system('pip uninstall -y pynvml')

	os.system('pip install -r lodestone-reqs.txt')

	os.system('pip install -e ./sentence-transformers')

	os.system('pip uninstall -y triton')
	os.system('pip install --no-deps triton==2.0.0.dev20221202')

	#####

	from pynvml import *
	import math
	from sentence_transformers import models, losses
	from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
	import logging
	import os
	import json
	import torch
	import boto3
	from smart_open import open
	import random
	import time
	import gc

	os.environ["PATH"]="/usr/local/cuda-11.7/bin:"+os.getenv("PATH")
	os.environ["TOKENIZERS_PARALLELISM"] = "false"

	#####


	def print_gpu_utilization():
	"This helper function outputs the current GPU memory usage."
	nvmlInit()
	handle = nvmlDeviceGetHandleByIndex(0)
	info = nvmlDeviceGetMemoryInfo(handle)
	return f"GPU memory occupied: {info.used/1024**3} GB."

	#####


	class MultiDatasetDataLoader:
	"""
	This custom dataloader class consumes a list of datasets and a batch size and produces batches randomly sampled
	from the datasets provided where each batch consists of records from a single dataset and datasets are chosen
	for batches in proportion to their total number of records.
	"""
	def __init__(self, datasets, batch_size_pairs, batch_size_triplets=None, dataset_size_temp=-1, allow_swap=True):
	self.allow_swap = allow_swap
	self.batch_size_pairs = batch_size_pairs
	self.batch_size_triplets = batch_size_pairs if batch_size_triplets is None else batch_size_triplets

	# Compute dataset weights
	self.dataset_lengths = list(map(len, datasets))
	self.dataset_lengths_sum = sum(self.dataset_lengths)

	weights = []
	# if dataset_size_temp > 0: # Scale probability with dataset size
	# for dataset in datasets:
	# prob = len(dataset) / self.dataset_lengths_sum
	# weights.append(max(1, int(math.pow(prob, 1 / dataset_size_temp) * 1000)))
	# else: # Equal weighting of all datasets
	# weights = [100] * len(datasets)
	for dataset in datasets:
	weights.append(len(dataset))

	# logging.info("Dataset lengths and weights: {}".format(list(zip(self.dataset_lengths, weights))))

	self.dataset_idx = []
	self.dataset_idx_pointer = 0

	for idx, weight in enumerate(weights):
	self.dataset_idx.extend([idx] * weight)
	random.shuffle(self.dataset_idx)

	self.datasets = []
	for dataset in datasets:
	random.shuffle(dataset)
	self.datasets.append({
	'elements': dataset,
	'pointer': 0,
	})

	def __iter__(self):
	for _ in range(int(self.__len__())):
	# Select dataset
	if self.dataset_idx_pointer >= len(self.dataset_idx):
	self.dataset_idx_pointer = 0
	random.shuffle(self.dataset_idx)

	dataset_idx = self.dataset_idx[self.dataset_idx_pointer]
	self.dataset_idx_pointer += 1

	# Select batch from this dataset
	dataset = self.datasets[dataset_idx]
	batch_size = self.batch_size_pairs if len(dataset['elements'][0].texts) == 2 else self.batch_size_triplets

	batch = []
	texts_in_batch = set()
	guid_in_batch = set()
	while len(batch) < batch_size:
	example = dataset['elements'][dataset['pointer']]

	valid_example = True
	# First check if one of the texts in already in the batch
	for text in example.texts:
	text_norm = text.strip().lower()
	if text_norm in texts_in_batch:
	valid_example = False

	texts_in_batch.add(text_norm)

	# If the example has a label, check if label is in batch
	if example.guid is not None:
	valid_example = valid_example and example.guid not in guid_in_batch
	guid_in_batch.add(example.guid)

	if valid_example:
	if self.allow_swap and random.random() > 0.5:
	example.texts[0], example.texts[1] = example.texts[1], example.texts[0]

	batch.append(example)

	dataset['pointer'] += 1
	if dataset['pointer'] >= len(dataset['elements']):
	dataset['pointer'] = 0
	random.shuffle(dataset['elements'])

	yield self.collate_fn(batch) if self.collate_fn is not None else batch

	def __len__(self):
	return int(self.dataset_lengths_sum / self.batch_size_pairs)

	#####


	# These four classes of custom generators parse the raw data from the files in S3 and format it into InputExamples which can be properly interpreted by a SentenceTransformer model.

	class RedditTitleBodyDataset:
	def __init__(self, source_uri, max_seq_length):
	self.source_uri = source_uri
	self.s3_client = boto3.client("s3")
	self.max_seq_length = max_seq_length

	def __iter__(self):
	while True:
	for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
	data_line = json.loads(json_line.strip())

	if "title" in data_line and "body" in data_line:
	data = {'guid': None, 'texts': [" ".join(data_line['title'].split(" ")[:self.max_seq_length]), " ".join(data_line['body'].split(" ")[:self.max_seq_length])]}
	record = InputExample(guid=data.get('guid', None), texts=data['texts'])

	yield record


	class RedditYearDataset:
	def __init__(self, source_uri, max_seq_length):
	self.source_uri = source_uri
	self.s3_client = boto3.client("s3")
	self.max_seq_length = max_seq_length

	def __iter__(self):
	while True:
	for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
	data_line = json.loads(json_line.strip())

	if "response" in data_line and "context" in data_line:
	data = {'guid': None, 'texts': [" ".join(data_line['response'].split(" ")[:self.max_seq_length]), " ".join(data_line['context'].split(" ")[:self.max_seq_length])]}
	record = InputExample(guid=data.get('guid', None), texts=data['texts'])

	yield record


	class HuggingFaceQueryPosDataset:
	def __init__(self, source_uri, max_seq_length):
	self.source_uri = source_uri
	self.s3_client = boto3.client("s3")
	self.max_seq_length = max_seq_length

	def __iter__(self):
	while True:
	for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
	data_line = json.loads(json_line.strip())

	if "query" in data_line and "pos" in data_line:
	for i in range(len(data_line['pos'])):
	data = {'guid': None, 'texts': [" ".join(data_line['query'].split(" ")[:self.max_seq_length]), " ".join(data_line['pos'][i].split(" ")[:self.max_seq_length])]}
	record = InputExample(guid=data.get('guid', None), texts=data['texts'])

	yield record


	class Dataset:
	def __init__(self, source_uri, max_seq_length):
	self.source_uri = source_uri
	self.s3_client = boto3.client("s3")
	self.max_seq_length = max_seq_length

	def __iter__(self):
	while True:
	for json_line in open(self.source_uri, transport_params={"client": self.s3_client}):
	data_line = json.loads(json_line.strip())

	if not isinstance(data_line, dict):
	data = {'guid': None, 'texts': data_line}
	for text_idx in range(len(data['texts'])):
	data['texts'][text_idx] = " ".join(data['texts'][text_idx].split(" ")[:self.max_seq_length])
	record = InputExample(guid=data.get('guid', None), texts=data['texts'])
	else:
	for text_idx in range(len(data_line['texts'])):
	data_line['texts'][text_idx] = " ".join(data_line['texts'][text_idx].split(" ")[:self.max_seq_length])
	record = InputExample(guid=data_line.get('guid', None), texts=data_line['texts'])

	yield record

	#####


	def build_generators(data_records, max_seq_length=512, testing=False):
	"""
	This function consumes the data_records dictionary and creates a new dictionary of data generators where each entry is
	of the form {filename: data generator object}.
	"""
	if testing:
	# filepaths = [file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')]
	filepaths = [file for file in list(data_records.keys())][:3]
	else:
	filepaths = list(data_records.keys())
	generators = {}
	for filepath in filepaths:
	filepath = filepath.strip()
	source_uri = 's3://lodestone-rnd/data/'+filepath
	if filepath in ['S2ORC_citations_abstracts.json.gz', 'amazon-qa.json.gz'] or 'reddit' in filepath:
	if "title" in filepath:
	generators[f'{filepath.split(".")[0]}'] = iter(RedditTitleBodyDataset(source_uri, max_seq_length))
	elif "reddit" in filepath:
	generators[f'{filepath.split(".")[0]}'] = iter(RedditYearDataset(source_uri, max_seq_length))
	else:
	generators[f'{filepath.split(".")[0]}'] = iter(HuggingFaceQueryPosDataset(source_uri, max_seq_length))
	else:
	generators[f'{filepath.split(".")[0]}'] = iter(Dataset(source_uri, max_seq_length))

	return generators

	#####


	def produce_data(data_records, num_chunks, generators, batch_size, failed_on=None, first_iter=False, testing=False, temp=-1):
	"""
	This function consumes the data_records dictionary, the number of chunks to break the datasets into, the dictionary of
	data generators, and a batch size and returns a MultiDatasetDataloader which can be fed into the .fit method of a
	SentenceTransformer model.
	"""
	if testing:
	# filepaths = [file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')]
	filepaths = [file for file in list(data_records.keys())][:3]
	else:
	filepaths = list(data_records.keys())
	datasets = []
	for file_idx, filepath in enumerate(filepaths):
	filepath = filepath.strip()
	dataset = []

	if failed_on is not None and failed_on != 1 and first_iter:
	for k in range((failed_on-1)*max(1, data_records[filepath]//num_chunks)):
	next(generators[f'{filepath.split(".")[0]}'])
	for m in range(max(1, data_records[filepath]//num_chunks)):
	dataset.append(next(generators[f'{filepath.split(".")[0]}']))
	else:
	for n in range(max(1, data_records[filepath]//num_chunks)):
	dataset.append(next(generators[f'{filepath.split(".")[0]}']))

	datasets.append(dataset)
	logging.info("{}. {}: {}".format(file_idx+1, filepath, len(dataset)))

	dataset_lengths_sum = sum(list(map(len, datasets)))

	batch_size_pairs = batch_size_triplets = batch_size
	# Special data loader to load from multiple datasets
	train_dataloader = MultiDatasetDataLoader(datasets=datasets,
	batch_size_pairs=batch_size_pairs,
	batch_size_triplets=batch_size_triplets,
	dataset_size_temp=temp)

	return train_dataloader, dataset_lengths_sum

	#####


	def construct_model(model_name, max_seq_length=512):
	"""
	This function constructs a SentenceTransformer model from a HuggingFace transformer model name
	or from a local path to a transformer model repository.
	"""
	word_embedding_model = models.Transformer(model_name_or_path=model_name,
	max_seq_length=max_seq_length,
	tokenizer_name_or_path='bert-base-uncased',
	trust_remote_code=True,
	model_args={'torch_dtype': torch.bfloat16})
	pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
	norm = models.Normalize()
	model = SentenceTransformer(modules=[word_embedding_model, pooling_model, norm], device='cuda')
	model[0].tokenizer.model_max_length = max_seq_length

	return model

	#####


	# Just some code to print debug information to stdout
	logging.basicConfig(format='%(asctime)s - %(message)s',
	datefmt='%Y-%m-%d %H:%M:%S',
	level=logging.INFO,
	handlers=[LoggingHandler()])
	# /print debug information to stdout

	#####


	# Set Hyperparameters
	model_name = 'mosaic-bert-base-seqlen-2048'
	# model_name = 'hum-lodestone-v1'
	batch_size = 16
	batch_size_pairs = batch_size_triplets = batch_size
	max_seq_length = 2048
	use_amp = False

	num_cycles = 2
	num_chunks = 50
	num_epochs = 2
	steps_per_epoch = 10000
	# Total training steps = num_cycles * num_chunks * num_epochs * steps_per_epoch = 2 * 50 * 2 * 10,000 = 2,000,000 steps
	warmup_steps = 500

	testing = False
	temp = -1

	#####


	output_path = 'hum-lodestone-v1'
	logging.info("Output: "+output_path)

	# Instantiate SentenceTransformer Model
	model = construct_model(model_name=model_name, max_seq_length=max_seq_length)

	# Load File Names and Record Volumes
	with open('data_records.json') as fIn:
	data_records = json.load(fIn)

	total_pairs = sum(data_records.values())

	logging.info("Total Training Pairs: {}".format(total_pairs))

	# Initialize Data Generators
	generators = build_generators(data_records=data_records,
	max_seq_length=max_seq_length,
	testing=testing)

	logging.info("Data Generators Initialized")

	# Define Training Loss Function
	train_loss = losses.MultipleNegativesRankingLoss(model,
	scale=20,
	similarity_fct=util.dot_score)

	logging.info(print_gpu_utilization())

	#####


	# Configure Training Cycles
	failed_on = None # chunk that the process failed on
	random.seed(42)
	steps = 0
	first_iter = True
	for cycle_num in range(num_cycles):
	logging.info("Starting Cycle {}".format(cycle_num+1))
	for chunk_num in range(num_chunks):
	if failed_on is not None and (chunk_num+1) < failed_on and (cycle_num+1) == 1:
	pass
	else:
	logging.info("Chunk {}/{}".format(chunk_num+1, num_chunks))
	logging.info("Loading {} Datasets".format(len([file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')]) if testing else len(data_records)))
	# t_dataload0 = time.time()
	# Create the training dataloader for the given chunk of data
	train_dataloader, dataset_lengths_sum = produce_data(data_records,
	num_chunks,
	generators,
	batch_size,
	failed_on=failed_on,
	first_iter=first_iter,
	testing=testing,
	temp=temp)
	first_iter = False
	# t_dataload1 = time.time()
	# print(t_dataload1-t_dataload0)

	logging.info(print_gpu_utilization())

	# steps_per_epoch = dataset_lengths_sum // batch_size_pairs

	for epoch_num in range(num_epochs):
	logging.info("Performing Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1))
	try:
	# t_fit0 = time.time()
	# Train the model
	model.fit(train_objectives=[(train_dataloader, train_loss)],
	evaluator=None,
	epochs=1,
	warmup_steps=warmup_steps,
	steps_per_epoch=steps_per_epoch,
	use_amp=use_amp,
	output_path=output_path)
	# t_fit1 = time.time()
	# print(t_fit1-t_fit0)

	steps += steps_per_epoch

	logging.info(print_gpu_utilization())
	logging.info("Succeeded on Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1))
	logging.info("{} Steps Completed in Total".format(steps))

	with open('train_logs.txt', 'a') as log:
	log.write("Succeeded on Cycle {}, Chunk {}, Epoch {}: {} Steps Completed in Total\n".format(cycle_num+1, chunk_num+1, epoch_num+1, steps))

	except:
	logging.info("Failed on Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1))

	with open('train_logs.txt', 'a') as log:
	log.write("Failed on Cycle {}, Chunk {}, Epoch {}: {} Steps Completed in Total\n".format(cycle_num+1, chunk_num+1, epoch_num+1, steps))

	finally:
	warmup_steps = 0

	# Clear GPU/CUDA memory cache between data chunks
	train_dataloader = None
	model = None
	train_loss = None

	gc.collect()
	torch.cuda.empty_cache()

	# Reload the model and reinitialize the loss function
	model = construct_model(model_name='hum-lodestone-v1', max_seq_length=max_seq_length)

	train_loss = losses.MultipleNegativesRankingLoss(model,
	scale=20,
	similarity_fct=util.dot_score)

	logging.info(print_gpu_utilization())