# Copyright (c) 2024, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file has been modified from its original version
#
"""Pretrain utilities."""
from datetime import datetime
from functools import partial
from collections import defaultdict
import math
import sys
from contextlib import nullcontext
import torch
import torch.nn.functional as F
import deepspeed
from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler
import numpy as np
from megatron.utils import (
Timers,
init_wandb,
get_ltor_masks_and_position_ids,
reduce_losses,
)
from megatron import print_rank_0, mpu
from megatron.model import (
GPT2ModelPipe,
SoftEmbedding,
get_params_for_weight_decay_optimization,
mark_norms_for_sequence_parallel_grad_sync,
)
from megatron.mpu.mappings import gather_from_model_parallel_region
from megatron.checkpointing import load_checkpoint, save_checkpoint
from megatron.data.data_utils import (
build_train_valid_test_data_loaders,
shift_and_wrap_data_loaders,
)
from megatron.initialize import initialize_megatron
from megatron.learning_rates import AnnealingLR
from megatron.logging import tb_wandb_log, training_log
from megatron.utils import (
OverflowMonitor,
get_noise_scale_logger,
get_total_params,
CharCounter,
)
from megatron.model.gpt2_model import cross_entropy
from megatron.mpu import vocab_parallel_cross_entropy
from pickle import dump
import os
def mup_weights_reinit(neox_args, model):
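    """Re-run muP's replacement weight init on every module that supports it; assumes
    set_base_shapes has already given each weight an .infshape attribute."""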
def has_method(o, name):
return callable(getattr(o, name, None))
for layer in model.modules():
# This normally would happen in set_base_shapes if we actually were able to use the MuReadout class
if hasattr(layer, "mup_rescale_parameters") and layer.mup_rescale_parameters:
layer._rescale_parameters()
if has_method(layer, "mup_reinitialize_weights"):
layer.mup_reinitialize_weights(neox_args)
def save_base_shapes(neox_args, base_shapes, use_cache):
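    """Compute muP base shapes from a base model and a width-scaled delta model, save them to disk, and exit."""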
# Instantiation of the base model fails in the init function (init_functions.py) because we haven't called set_base_shapes on it at this point, so disable it temporarily here
neox_args.use_mup = False
base_model = GPT2ModelPipe(
neox_args=neox_args,
num_tokentypes=0,
        parallel_output=(neox_args.train_impl != "rm"),
topology=mpu.get_topology(),
use_cache=use_cache,
)
if not neox_args.is_pipe_parallel:
base_model = base_model.to_sequential()
try:
import mup
    except ModuleNotFoundError as e:
        raise ImportError(
            "Please install mup: https://github.com/microsoft/mup"
        ) from e
base_shapes = mup.get_shapes(base_model)
del base_model
old_hidden_size = neox_args.hidden_size
neox_args.hidden_size = neox_args.hidden_size * neox_args.mup_width_scale
delta_model = GPT2ModelPipe(
neox_args=neox_args,
num_tokentypes=0,
        parallel_output=(neox_args.train_impl != "rm"),
topology=mpu.get_topology(),
use_cache=use_cache,
)
if not neox_args.is_pipe_parallel:
delta_model = delta_model.to_sequential()
delta_shapes = mup.get_shapes(delta_model)
# change back
neox_args.use_mup = True
neox_args.hidden_size = old_hidden_size
save_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}"
print(f"saving base shapes at {save_shapes}")
mup.make_base_shapes(base_shapes, delta_shapes, savefile=save_shapes)
print(f"base shapes saved...exiting")
sys.exit(1)
def mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator):
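    """Run muP coordinate checks across a range of widths, for both muP and standard
    parametrization, and save the plots before exiting."""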
from megatron.mup_substitute import get_coord_data
from mup.coord_check import plot_coord_data
def lazy_model(hidden_size):
def gen():
old_hidden_size = neox_args.hidden_size
neox_args.hidden_size = hidden_size
model, optimizer, _, _ = setup_model_and_optimizer(
neox_args=neox_args, use_cache=False
)
neox_args.hidden_size = old_hidden_size
return model
return gen
models = {}
# Hidden size needs to be divisible by num attention heads
for hidden_size in (neox_args.num_attention_heads * (2**p) for p in range(2, 9)):
models[hidden_size] = lazy_model(hidden_size)
neox_args.use_mup = True
df_up = get_coord_data(
neox_args, timers, lr_scheduler, models, train_data_iterator, mup=True
)
neox_args.use_mup = False
df_sp = get_coord_data(
neox_args, timers, lr_scheduler, models, train_data_iterator, mup=False
)
plot_coord_data(df_up, save_to=f"coord_check_up.{torch.distributed.get_rank()}.jpg")
plot_coord_data(df_sp, save_to=f"coord_check_sp.{torch.distributed.get_rank()}.jpg")
print_rank_0("Saved coord check plots... exiting")
sys.exit(1)
def update_iterations(neox_args, data_loaders):
"""
    Compute the number of train iterations from train_epochs when train_iters is not specified, and update the neox_args object.
Note that if len(train_dataloader) % gradient_accumulation_steps != 0, this will configure neox
to do as many iterations as possible while ensuring that each example is seen *at most* train_epochs
times.
"""
if (not neox_args.do_train) or (neox_args.train_iters is not None):
pass
elif neox_args.train_iters is None and neox_args.train_epochs is None:
print_rank_0(
"ERROR:Failed to specify either train_epochs or train_iters in config file"
)
else:
global_rank = torch.distributed.get_rank()
if global_rank == 0:
train_dataloader = data_loaders["train"]
train_epochs = neox_args.train_epochs
gradient_accumulation_steps = neox_args.gradient_accumulation_steps
train_dataloader_len = len(train_dataloader)
train_iterations = (
train_dataloader_len * train_epochs
) // gradient_accumulation_steps
train_iters_tensor = torch.cuda.LongTensor([train_iterations])
else:
train_iters_tensor = torch.cuda.LongTensor([0])
torch.distributed.broadcast(train_iters_tensor, src=0)
neox_args.train_iters = train_iters_tensor[0].item()
print_rank_0(
f"Training for a total of {neox_args.train_iters} iterations, corresponding to {neox_args.train_epochs} epochs."
)
def pretrain(neox_args):
"""Main training program.
This function will run the following in the order provided:
1) initialize Megatron.
2) get train/val/test datasets.
3) setup model, optimizer and lr schedule.
4) configure data loading
5) train the model.
Arguments:
neox_args: an instance of NeoXArgs containing the configuration for pretrain
"""
# setup logging and timers
init_wandb(neox_args=neox_args)
timers = Timers(
use_wandb=neox_args.use_wandb,
tensorboard_writer=neox_args.tensorboard_writer,
comet_experiment=neox_args.comet_experiment,
)
# Initialize and get arguments, timers, and Tensorboard writer.
initialize_megatron(neox_args=neox_args)
# Create data loaders
timers("train/valid/test data loaders").start()
data_loaders = build_train_valid_test_data_loaders(neox_args=neox_args)
update_iterations(neox_args=neox_args, data_loaders=data_loaders)
timers("train/valid/test data loaders").stop()
# Model, optimizer, and learning rate.
timers("model and optimizer").start()
model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer(
neox_args=neox_args, use_cache=False, iteration=neox_args.iteration
)
timers("model and optimizer").stop()
# Make and configure iterators
timers("train/valid/test data iterators").start()
(
train_data_iterator,
valid_data_iterator,
test_data_iterator,
) = shift_and_wrap_data_loaders(neox_args=neox_args, data_loaders=data_loaders)
timers("train/valid/test data iterators").stop()
if neox_args.use_mup and neox_args.coord_check:
mup_coord_check(neox_args, timers, lr_scheduler, train_data_iterator)
# Print setup timing.
print_rank_0("done with setups ...")
timers.log(
[
"train/valid/test data loaders",
"model and optimizer",
"train/valid/test data iterators",
]
)
print_rank_0("training ...")
iteration = neox_args.iteration
# edge case: save step 0 checkpoint if requested and we're starting from step 0
if (
neox_args.save
and neox_args.extra_save_iters
and 0 in neox_args.extra_save_iters
and iteration == 0
):
save_checkpoint(
neox_args=neox_args,
iteration=iteration,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
)
if neox_args.do_train and neox_args.train_iters > 0:
iteration = train(
neox_args=neox_args,
timers=timers,
model=model,
reference_model=reference_model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
train_data_iterator=train_data_iterator,
valid_data_iterator=valid_data_iterator,
)
if neox_args.do_valid:
prefix = "the end of training for val data"
evaluate_and_print_results(
neox_args=neox_args,
prefix=prefix,
forward_step_func=forward_step,
data_iterator=valid_data_iterator,
model=model,
iteration=iteration,
verbose=False,
timers=timers,
reference_model=reference_model,
)
if neox_args.save and iteration != 0:
save_checkpoint(
neox_args=neox_args,
iteration=iteration,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
)
if neox_args.do_test:
# Run on test data.
prefix = "the end of training for test data"
evaluate_and_print_results(
neox_args=neox_args,
prefix=prefix,
forward_step_func=forward_step,
data_iterator=test_data_iterator,
model=model,
iteration=iteration,
verbose=True,
timers=timers,
chart_name="test",
reference_model=reference_model,
)
def _get_batch(neox_args, tokenizer, keys, data, datatype, label_mask_zero=False):
"""Support function for get_batch / get_batch pipe (to avoid code repetition)"""
data_b = mpu.broadcast_data(keys, data, datatype)
token_key = keys[0]
label_key = keys[1] if len(keys) > 1 else None
# Unpack.
tokens_ = data_b[token_key].long()
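    # When a label stream is present, negative label values mark positions that
    # should be excluded from the loss; build the mask first, then clamp them to 0.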
if label_key in data_b:
label_mask = (data_b[label_key].long() >= 0)[:, 1:].contiguous()
labels = torch.where(
data_b[label_key].long() >= 0,
data_b[label_key].long(),
torch.zeros_like(data_b[label_key].long()),
)[:, 1:].contiguous()
else:
label_mask = (tokens_.long() >= 0)[:, 1:].contiguous()
labels = tokens_[:, 1:].contiguous()
if label_mask_zero:
labels = labels * label_mask
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
data=tokens,
eod_token=neox_args.tokenizer.eod,
eod_mask_loss=neox_args.eod_mask_loss,
sliding_window_width=neox_args.sliding_window_width,
)
# combine loss masks from get_ltor_masks_and_position_ids with loss masks from data
loss_mask = label_mask.to(loss_mask.dtype) * loss_mask
return tokens, labels, loss_mask, attention_mask, position_ids
def get_batch(neox_args, data_iterator):
"""Generate a batch"""
# Items and their type.
if neox_args.train_impl in ["normal", "kto"]:
keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"]
elif neox_args.train_impl in ["dpo", "rm"]:
keys = (
[["pos", "pos_label"], ["neg", "neg_label"]]
if neox_args.pos_train_label_data_paths
else [["pos"], ["neg"]]
)
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
if neox_args.train_impl == "normal":
return _get_batch(
neox_args=neox_args,
tokenizer=neox_args.tokenizer,
keys=keys,
data=data,
datatype=datatype,
)
elif neox_args.train_impl == "kto":
assert (
neox_args.train_micro_batch_size_per_gpu > 1
), "For KTO training, the train_micro_batch_size_per_gpu must be greater than 1."
tup = _get_batch(
neox_args=neox_args,
tokenizer=neox_args.tokenizer,
keys=keys,
data=data,
datatype=datatype,
)
        # Drop the last token from the reward: since the model predicts the next
        # token, the reward for the current prediction aligns with the next token's label.
rw_data = mpu.broadcast_data(["reward"], data, torch.float)["reward"][
:, :-1
].contiguous()
ref_data = (
mpu.broadcast_data(["ref"], data, torch.float)["ref"][:, :-1].contiguous()
if neox_args.precompute_model_name
else None
)
return tup + (rw_data, ref_data)
elif neox_args.train_impl in ["dpo", "rm"]:
pos_tup = _get_batch(
neox_args=neox_args,
tokenizer=neox_args.tokenizer,
keys=keys[0],
data=data,
datatype=datatype,
label_mask_zero=True,
)
neg_tup = _get_batch(
neox_args=neox_args,
tokenizer=neox_args.tokenizer,
keys=keys[1],
data=data,
datatype=datatype,
label_mask_zero=True,
)
if neox_args.precompute_model_name:
ref_data = mpu.broadcast_data(["pos_ref", "neg_ref"], data, torch.float)
else:
ref_data = {"pos_ref": None}
return [
torch.cat((pos_item, neg_item), dim=0)
for pos_item, neg_item in zip(pos_tup, neg_tup)
] + [
torch.cat((ref_data["pos_ref"], ref_data["neg_ref"]), dim=0)[
:, :-1
].contiguous()
if ref_data["pos_ref"] is not None
else None
]
def get_batch_pipe(data, neox_args, curr_scheduler=None):
"""A modification of get_batch() to work with the latest batch instead of an iterator."""
assert neox_args.train_impl not in [
"kto",
"dpo",
"rm",
], "Pipeline parallel is currently unsupported when using any of kto, dpo, rm. Set pipe_parallel_size to 0"
# Items and their type.
keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"]
datatype = torch.int64
tokens, labels, loss_mask, attention_mask, position_ids = _get_batch(
neox_args, neox_args.tokenizer, keys, data, datatype
)
if curr_scheduler is not None:
# iteration + 1 to align with how/when DeepSpeed updates the buffers
curriculum_seqlen = curr_scheduler.update_difficulty(neox_args.iteration + 1)
if curriculum_seqlen < tokens.size()[1]:
# seqlen-based curriculum learning
# input_ids, position_ids, labels have size [batch size, seqlen]
# input_ids = input_ids[:, :curriculum_seqlen].contiguous()
tokens = tokens[:, :curriculum_seqlen].contiguous()
position_ids = position_ids[:, :curriculum_seqlen].contiguous()
if labels is not None:
labels = labels[:, :curriculum_seqlen].contiguous()
if loss_mask is not None:
loss_mask = loss_mask[:, :curriculum_seqlen].contiguous()
# attention_mask has size [1, 1, seqlen, seqlen]
attention_mask = attention_mask[
:, :, :curriculum_seqlen, :curriculum_seqlen
].contiguous()
# unpack data
return (tokens, position_ids, attention_mask), (labels, loss_mask)
def get_batch_sequential(forward_input, neox_args):
"""A modification of get_batch() to work with the latest batch instead of an iterator."""
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
data=forward_input[0],
eod_token=neox_args.tokenizer.eod,
eod_mask_loss=neox_args.eod_mask_loss,
)
return (forward_input[0], forward_input[1], attention_mask)
def average_losses_across_data_parallel_group(losses):
"""Reduce a tensor of losses across all GPUs."""
averaged_losses = torch.cat([loss.clone().detach().view(1) for loss in losses])
torch.distributed.all_reduce(averaged_losses, group=mpu.get_data_parallel_group())
averaged_losses = averaged_losses / torch.distributed.get_world_size(
group=mpu.get_data_parallel_group()
)
return averaged_losses
def mb_moe_loss_func(args, loss_mask, output_tensor=None):
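    """Compute the MegaBlocks MoE load-balancing loss, averaged across data parallel replicas.

    The main LM loss is computed elsewhere; this only returns the auxiliary term.
    """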
from megatron.model import megablocks_utils
from megatron.model.megablocks_utils import moe
# NOTE: For pipeline parallelism this function will be run on the
# non-final stages to calculate load balancing loss contribution
# for the MoE layers within the stage. For these cases, output_tensor
# will be None.
loss, loss_dict = (None, {})
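    # NOTE: the `if False` branch below is intentionally disabled (kept from upstream);
    # the LM loss is computed by the caller, so only the load-balancing term is returned.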
if False:
assert output_tensor is not None
loss, loss_dict = loss_func(loss_mask, output_tensor)
assert loss.numel() == 1
# NOTE: If recompute is enabled we will collect duplicate load
# balancing loss contributions. Prune these before calculating
# the load balancing loss.
if args.checkpoint_activations:
        # Ignore load balancing loss contributions computed during
        # the forward pass if recompute is turned on.
load_balancing_loss_data = moe.get_load_balancing_loss()
if args.num_layers * 2 == len(load_balancing_loss_data):
load_balancing_loss_data = load_balancing_loss_data[args.num_layers :]
moe.clear_load_balancing_loss()
for x in load_balancing_loss_data:
moe.save_load_balancing_loss(x)
# Compute the load balancing loss for all MoE layers.
    megablocks_args = megablocks_utils.as_megablocks_args(args)
lbl = moe.batched_load_balancing_loss(megablocks_args)
moe.clear_load_balancing_loss()
# Average the load balancing loss across data parallel
# replicas and save for logging.
averaged_lbl = average_losses_across_data_parallel_group([lbl])
loss_dict["load balancing loss"] = averaged_lbl[0]
return averaged_lbl, loss_dict
def get_logp(logits, labels, force_fp32=False):
    # Cross-entropy is the negative log-probability, so negate it rather than reimplementing logp.
if force_fp32:
logits = logits.float()
return -vocab_parallel_cross_entropy(logits, labels)
def get_pos_neg_logp(logits, labels, force_fp32=False):
    # Cross-entropy is the negative log-probability, so negate it rather than reimplementing logp.
if force_fp32:
logits = logits.float()
return torch.chunk(-vocab_parallel_cross_entropy(logits, labels), 2, 0)
def forward_step(
data_iterator,
model,
neox_args,
timers,
return_logits=False,
is_train=False,
reference_model=None,
):
"""Forward step."""
if neox_args.is_pipe_parallel:
return model.eval_batch(data_iterator, return_logits=return_logits)
# Get the batch.
if neox_args.memory_profiling and neox_args.iteration:
torch.cuda.nvtx.range_push(f"Get batch")
if timers is not None:
timers("batch generator").start()
if neox_args.train_impl == "normal":
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
neox_args=neox_args, data_iterator=data_iterator
)
elif neox_args.train_impl == "kto":
(
tokens,
labels,
loss_mask,
attention_mask,
position_ids,
rewards,
ref_logp,
) = get_batch(neox_args=neox_args, data_iterator=data_iterator)
if neox_args.train_impl in ["dpo", "rm"]:
tokens, labels, loss_mask, attention_mask, position_ids, ref_logp = get_batch(
neox_args=neox_args, data_iterator=data_iterator
)
if timers is not None:
timers("batch generator").stop()
    # match the range_push condition above so push/pop stay balanced
    if neox_args.memory_profiling and neox_args.iteration:
torch.cuda.nvtx.range_pop()
if neox_args.memory_profiling:
torch.cuda.nvtx.range_push(f"Forward pass")
metrics = {}
if neox_args.train_impl == "normal":
# Sequential returns moe_losses, but this is not yet supported by pipe parallel
maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args)
if type(maybe_tuple) is tuple:
outputs, moe_losses = maybe_tuple
else:
outputs = maybe_tuple
moe_losses = []
if (
is_train
and neox_args.curriculum_learning
and neox_args.curriculum_seqlen < neox_args.seq_length
):
loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous()
labels = labels[:, : neox_args.curriculum_seqlen].contiguous()
main_loss = cross_entropy(
outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy
)
if neox_args.moe_num_experts > 1:
if neox_args.moe_type == "deepspeed":
moe_loss = neox_args.moe_loss_coeff * sum(m.item() for m in moe_losses)
elif neox_args.moe_type == "megablocks":
moe_loss = mb_moe_loss_func(neox_args, loss_mask, outputs)[0]
else:
raise ValueError(f"Unsupported moe_type: {neox_args.moe_type}")
else:
moe_loss = 0.0
loss = main_loss + moe_loss
elif neox_args.train_impl == "rm":
maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args)
if type(maybe_tuple) is tuple:
outputs, _ = maybe_tuple
else:
outputs = maybe_tuple
pos, neg = torch.chunk(outputs, 2, 0)
pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0)
        # We assume that each pos/neg pair occurs in the same order,
        # e.g. the second nonzero pos is the corresponding second nonzero neg,
        # and that each sequence contains an equal number of pos and neg labels.
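        # The loss below is a pairwise Bradley-Terry-style term, -log sigmoid(pos - neg),
        # plus a z-loss penalty on the squared reward magnitudes.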
pos_indx = pos_loss_mask.nonzero()
neg_indx = neg_loss_mask.nonzero()
# indx[:, 0] is the batch index, indx[:, 1] is the token index, we only care about the token index.
pos_indx = pos_indx[:, 1].unsqueeze(1)
neg_indx = neg_indx[:, 1].unsqueeze(1)
pos = torch.gather(pos.squeeze(), dim=1, index=pos_indx)
neg = torch.gather(neg.squeeze(), dim=1, index=neg_indx)
with torch.no_grad():
metrics["pos_values"] = pos.clone().detach().mean()
metrics["neg_values"] = neg.clone().detach().mean()
metrics["margin"] = (pos - neg).clone().detach().mean()
metrics["accuracy"] = ((pos - neg) > 0).clone().detach().float().mean()
loss = (-F.logsigmoid(pos - neg).mean()) + (
(neox_args.z_loss * (pos**2 + neg**2)).mean()
)
elif neox_args.train_impl == "dpo":
# Based on https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90
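        # DPO loss: -log sigmoid(beta * ((logp_pi(pos) - logp_pi(neg)) - (logp_ref(pos) - logp_ref(neg)))),
        # where the logps are summed over each sequence's masked tokens.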
with torch.inference_mode():
# So we can gather token logps...
token_logp_labels = labels.clone()
pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0)
if neox_args.dpo_reference_free:
ref_pos = 0
ref_neg = 0
elif ref_logp is None:
ref_maybe_tuple = reference_model(
(tokens, position_ids, attention_mask), neox_args=neox_args
)
if type(ref_maybe_tuple) is tuple:
                    # Ignore MoE losses for the reference model.
ref_outputs, _ = ref_maybe_tuple
else:
ref_outputs = ref_maybe_tuple
ref_pos, ref_neg = get_pos_neg_logp(
ref_outputs, token_logp_labels, neox_args.dpo_fp32
)
else:
ref_pos, ref_neg = torch.chunk(ref_logp, 2, 0)
ref_pos = (ref_pos * pos_loss_mask).sum(-1)
ref_neg = (ref_neg * neg_loss_mask).sum(-1)
chosen_maybe_tuple = model(
(tokens, position_ids, attention_mask), neox_args=neox_args
)
if type(chosen_maybe_tuple) is tuple:
            # Ignore MoE losses here.
chosen_outputs, _ = chosen_maybe_tuple
else:
chosen_outputs = chosen_maybe_tuple
chosen_pos, chosen_neg = get_pos_neg_logp(
chosen_outputs, token_logp_labels, neox_args.dpo_fp32
)
chosen_pos = (chosen_pos * pos_loss_mask).sum(-1)
chosen_neg = (chosen_neg * neg_loss_mask).sum(-1)
with torch.no_grad():
# Collect metrics...
if not neox_args.dpo_reference_free:
metrics["ref_neg"] = ref_neg.clone().detach().mean()
metrics["ref_pos"] = ref_pos.clone().detach().mean()
metrics["chosen_neg"] = chosen_neg.clone().detach().mean()
metrics["chosen_pos"] = chosen_pos.clone().detach().mean()
if not neox_args.dpo_reference_free:
chosen_rewards = neox_args.dpo_beta * (
chosen_pos.clone().detach() - ref_pos.clone().detach()
)
rejected_rewards = neox_args.dpo_beta * (
chosen_neg.clone().detach() - ref_neg.clone().detach()
)
metrics["chosen_rewards"] = chosen_rewards.mean()
metrics["rejected_rewards"] = rejected_rewards.mean()
reward_acc = (chosen_rewards > rejected_rewards).float()
metrics["reward_acc"] = reward_acc.mean()
metrics["margins"] = (chosen_rewards - rejected_rewards).mean()
        pi_logratios = chosen_pos - chosen_neg
        ref_logratios = ref_pos - ref_neg
        logits = pi_logratios - ref_logratios
loss = -F.logsigmoid(neox_args.dpo_beta * logits).mean()
elif neox_args.train_impl == "kto":
# Based on https://github.com/huggingface/trl/blob/main/trl/trainer/kto_trainer.py
# Except we don't have an extra input for KL logp, we just split the batch in half
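        # Each half of the microbatch serves as the (detached) KL estimate for the other
        # half, which is why KTO requires train_micro_batch_size_per_gpu > 1.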
with torch.no_grad():
# So we can gather token logps...
token_logp_labels = labels.clone()
token_logp_labels[token_logp_labels == -100] = 0
if ref_logp is None:
# Did not precompute logits....
ref_maybe_tuple = reference_model(
(tokens, position_ids, attention_mask), neox_args=neox_args
)
if type(ref_maybe_tuple) is tuple:
                    # Ignore MoE losses for the reference model.
ref_outputs, _ = ref_maybe_tuple
else:
ref_outputs = ref_maybe_tuple
# gather across tensor parallel group
ref_outputs = gather_from_model_parallel_region(ref_outputs)
ref_logp = get_logp(ref_outputs, token_logp_labels, neox_args.kto_fp32)
else:
print(f"REF LOGP: {ref_logp.clone().detach().mean()}")
ref_logp = ref_logp * loss_mask
scaling = (rewards.sum(-1) > 0.001).float() * neox_args.kto_desirable_weight
scaling += (
rewards.sum(-1) < -0.001
).float() * neox_args.kto_undesirable_weight
pos_mask = (rewards > 0.001).float()
neg_mask = (rewards < -0.001).float()
chosen_maybe_tuple = model(
(tokens, position_ids, attention_mask), neox_args=neox_args
)
if type(chosen_maybe_tuple) is tuple:
            # Ignore MoE losses here.
chosen_outputs, _ = chosen_maybe_tuple
else:
chosen_outputs = chosen_maybe_tuple
chosen_outputs = gather_from_model_parallel_region(chosen_outputs)
chosen_logp = get_logp(chosen_outputs, token_logp_labels, neox_args.kto_fp32)
chosen_logp = chosen_logp * loss_mask
with torch.no_grad():
# Collect metrics...
metrics["ref_logp"] = ref_logp.clone().detach().sum(-1).mean()
metrics["policy_logp"] = chosen_logp.clone().detach().sum(-1).mean()
metrics["pos_ref_logp"] = (
(ref_logp * pos_mask).clone().detach().sum(-1).mean()
)
metrics["neg_ref_logp"] = (
(ref_logp * neg_mask).clone().detach().sum(-1).mean()
)
metrics["pos_policy_logp"] = (
(chosen_logp * pos_mask).clone().detach().sum(-1).mean()
)
metrics["neg_policy_logp"] = (
(chosen_logp * neg_mask).clone().detach().sum(-1).mean()
)
metrics["kl"] = (
chosen_logp.clone().detach() - ref_logp.clone().detach()
).sum() / loss_mask.sum()
policy_rewards = (
neox_args.kto_beta
* rewards
* (chosen_logp.clone().detach() - ref_logp.clone().detach())
)
reward_acc = (policy_rewards.sum(-1) > 0.0).float()
metrics["reward_acc"] = reward_acc.mean()
metrics["policy_rewards"] = policy_rewards.sum()
print(metrics)
pol_logp1, pol_logp2 = torch.chunk(chosen_logp, 2, 0)
ref_logp1, ref_logp2 = torch.chunk(ref_logp, 2, 0)
reward1, reward2 = torch.chunk(rewards, 2, 0)
scaling1, scaling2 = torch.chunk(scaling, 2, 0)
kl1 = torch.clamp((pol_logp1 - ref_logp1).sum(-1), min=0).mean()
kl2 = torch.clamp((pol_logp2 - ref_logp2).sum(-1), min=0).mean()
log_ratio1 = pol_logp1 - ref_logp1
log_ratio2 = pol_logp2 - ref_logp2
# TODO: Add pack_until_overflow sequence support
loss = (
0.5
* scaling1.mean(-1)
* (
1
                - torch.sigmoid(
(
neox_args.kto_beta
* reward1.mean(-1)
* (log_ratio1.sum(-1) - kl2.clone().detach())
)
)
)
) + (
0.5
* scaling2.mean(-1)
* (
1
                - torch.sigmoid(
(
neox_args.kto_beta
* reward2.mean(-1)
* (log_ratio2.sum(-1) - kl1.clone().detach())
)
)
)
)
        loss = loss.mean()
if neox_args.memory_profiling:
torch.cuda.nvtx.range_pop()
if return_logits:
return loss, outputs, metrics
return loss, metrics
def get_model(neox_args, use_cache=False):
"""Build the model."""
# Build model on cpu.
print_rank_0("building GPT2 model ...")
# Temporarily disable mup so that the base model does not use the mup init functions before set_base_shapes is called below.
# If mup isn't being used anyways, this has no effect.
old_use_mup = neox_args.use_mup
neox_args.use_mup = False
if neox_args.zero_stage in [2, 3]:
if neox_args.pipe_parallel_size == 1:
print_rank_0(
"ZeRO stage 2/3 and the PipelineModule are incompatible, please set 'pipe_parallel_size' to 0 instead"
)
exit()
        if neox_args.pipe_parallel_size > 1:
            print_rank_0(
                "ZeRO stage 2/3 and pipeline parallelism are not supported simultaneously"
            )
            exit()
        if neox_args.model_parallel_size > 1:
            print_rank_0(
                "ZeRO stage 2/3 and model parallelism are not currently supported simultaneously"
            )
            exit()
with deepspeed.zero.Init(
config_dict_or_path=neox_args.deepspeed_config
) if neox_args.zero_stage == 3 else nullcontext() as gs:
model = GPT2ModelPipe(
neox_args=neox_args,
num_tokentypes=0,
            parallel_output=(neox_args.train_impl != "rm"),
topology=mpu.get_topology(),
use_cache=use_cache,
)
### soft prompt tuning stuff ###
if neox_args.soft_prompt_tuning is not None and neox_args.soft_prompt_tuning.get(
"enabled", False
):
soft_prompt = SoftEmbedding(
neox_args,
wte=getattr(model, "0").word_embeddings,
n_tokens=neox_args.soft_prompt_tuning.get("n_tokens", 10),
init_string=neox_args.soft_prompt_tuning.get("init_string", ""),
init_range=neox_args.soft_prompt_tuning.get("init_range", 0.5),
)
model.insert_layers(
layers=soft_prompt, idx=1
) # insert the soft prompt layer directly after the word embeddings
# freeze everything but the soft prompt
for name, param in model.named_parameters():
if not "soft_embedding" in name:
param.requires_grad = False
if not neox_args.is_pipe_parallel:
# Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training
model = model.to_sequential()
neox_args.use_mup = old_use_mup
if neox_args.use_mup:
try:
import mup
        except ModuleNotFoundError as e:
            raise ImportError(
                "Please install mup: https://github.com/microsoft/mup"
            ) from e
base_shapes = f"{neox_args.base_shapes_file}.{torch.distributed.get_rank()}"
if neox_args.save_base_shapes:
save_base_shapes(neox_args, base_shapes, use_cache)
mup.set_base_shapes(model, base_shapes)
# Call the mup replacement init functions on the model now that set_base_shapes has given each weight a .infshape attribute
mup_weights_reinit(neox_args, model)
if neox_args.deepspeed:
# DeepSpeed handles CUDA, FP16, and DDP components.
return model
else:
raise ValueError("Must be using deepspeed to run neox")
def get_optimizer(model, neox_args, dummy=False):
"""Set up the optimizer."""
if neox_args.no_load_optim and neox_args.deepspeed:
        # DeepSpeed requires an optimizer object, so construct a minimal dummy one.
dummy = True
neox_args.optimizer = {"params": {"lr": 0.0}}
neox_args.optimizer_type = "adam"
elif neox_args.no_load_optim:
return None, None
if neox_args.optimizer is None:
print_rank_0(
f"ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)"
)
exit()
# Build parameter groups (weight decay and non-decay).
param_groups = get_params_for_weight_decay_optimization(model, neox_args)
print_rank_0(
f'Configuring Optimizer type: {neox_args.optimizer_type} with params: {neox_args.optimizer["params"]}'
)
if neox_args.create_moe_param_group:
from deepspeed.moe.utils import (
is_moe_param,
split_params_into_different_moe_groups_for_optimizer,
)
param_groups = split_params_into_different_moe_groups_for_optimizer(
param_groups
)
# Add model parallel attribute if it is not set.
for param_group in param_groups:
for param in param_group["params"]:
if not hasattr(param, "model_parallel"):
param.model_parallel = False
# Filter out params that don't require a grad (for soft prompt tuning, etc.)
_param_groups = []
for param_group in param_groups:
trainable_params = [p for p in param_group["params"] if p.requires_grad]
if dummy:
trainable_params = [trainable_params[0]] # just take the first one
param_group["params"] = trainable_params
_param_groups.append(param_group)
if dummy:
# Only need one.
break
param_groups = _param_groups
# If we're using mup, then the optimizer must be adam or sgd
assert not neox_args.use_mup or (
neox_args.optimizer_type.lower() == "adam"
or neox_args.optimizer_type.lower() == "sgd"
), f"If use_mup == True, you must specify either the adam or sgd optimizers. You passed: {neox_args.optimizer_type.lower()}"
if neox_args.optimizer_type.lower() in ["cpu_adam", "cpu_torch_adam"]:
        if neox_args.optimizer_type.lower() == "cpu_torch_adam":
cpu_adam_optimizer = torch.optim.Adam
else:
from deepspeed.ops.adam import DeepSpeedCPUAdam
cpu_adam_optimizer = DeepSpeedCPUAdam
optimizer = cpu_adam_optimizer(
param_groups,
weight_decay=neox_args.weight_decay,
**neox_args.optimizer["params"],
)
elif neox_args.optimizer_type.lower() == "onebitadam":
assert neox_args.deepspeed
optimizer = None
# onebitadam needs to be instantiated within the deepspeed engine to work :|
elif neox_args.optimizer_type.lower() == "sm3":
from .optimizers import SM3
optimizer = SM3(param_groups, **neox_args.optimizer["params"])
elif neox_args.optimizer_type.lower() == "madgrad_wd":
from .optimizers import madgrad_wd
optimizer = madgrad_wd(
param_groups,
weight_decay=neox_args.weight_decay,
**neox_args.optimizer["params"],
)
elif neox_args.optimizer_type.lower() == "lion":
        # Use DeepSpeed's FusedLion under ZeRO; the Megatron Lion implementation raises a DeepSpeed error there.
if neox_args.zero_optimization["stage"] != 0:
from deepspeed.ops.lion import FusedLion
lion_optimizer = FusedLion
# if not zero
else:
from .optimizers import Lion
lion_optimizer = Lion
optimizer = lion_optimizer(
param_groups,
weight_decay=neox_args.weight_decay,
**neox_args.optimizer["params"],
)
elif neox_args.optimizer_type.lower() == "adam":
# Use Adam
if neox_args.use_mup:
            try:
                from mup import MuAdam

                adam_optimizer = MuAdam
            except ModuleNotFoundError as e:
                raise ImportError(
                    "Please install mup: https://github.com/microsoft/mup"
                ) from e
else:
if neox_args.use_bnb_optimizer:
                try:
                    import bitsandbytes as bnb

                    adam_optimizer = bnb.optim.Adam8bit
                except ModuleNotFoundError as e:
                    raise ImportError(
                        "Please install bitsandbytes following "
                        "https://github.com/facebookresearch/bitsandbytes."
                    ) from e
else:
try:
# default to apex as it's slightly faster
from apex.optimizers import FusedAdam as Adam
except ImportError:
# if apex isn't installed, use deepspeed's FusedAdam
print(
"WARNING: APEX not installed - defaulting to deepspeed's fused adam"
)
from deepspeed.ops.adam import FusedAdam as Adam
adam_optimizer = Adam
optimizer = adam_optimizer(
param_groups,
weight_decay=neox_args.weight_decay,
**neox_args.optimizer["params"],
)
elif neox_args.optimizer_type.lower() == "sgd":
try:
from mup import MuSGD
        except ModuleNotFoundError as e:
            raise ImportError(
                "Please install mup: https://github.com/microsoft/mup"
            ) from e
optimizer = MuSGD(
param_groups,
weight_decay=neox_args.weight_decay,
**neox_args.optimizer["params"],
)
else:
raise ValueError(f"Optimizer type {neox_args.optimizer_type} not recognized")
if neox_args.deepspeed:
# fp16 wrapper is not required for DeepSpeed.
return optimizer, param_groups
else:
raise ValueError("Must be using deepspeed to run neox")
def get_learning_rate_scheduler(optimizer, neox_args):
"""Build the learning rate scheduler."""
if (neox_args.no_load_optim) and not neox_args.deepspeed:
# TODO: this should be configured as a separate arg
return None
if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam":
print_rank_0(
"WARNING: onebitadam requires the lr scheduler be built by deepspeed - "
"Make sure one is added to your deepspeed config"
)
return None
# Add linear learning rate scheduler.
if neox_args.lr_decay_iters is not None:
num_iters = neox_args.lr_decay_iters
elif neox_args.lr_decay_fraction is not None:
num_iters = math.floor(neox_args.train_iters * neox_args.lr_decay_fraction)
else:
num_iters = neox_args.train_iters
num_iters = max(1, num_iters)
init_step = 0
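    # `warmup` is a fraction of the decay horizon, so this converts it to an iteration count.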
warmup_iter = neox_args.warmup * num_iters
lr_scheduler = AnnealingLR(
optimizer,
start_lr=neox_args.lr,
warmup_iter=warmup_iter,
total_iters=num_iters,
decay_style=neox_args.lr_decay_style,
last_iter=init_step,
min_lr=neox_args.min_lr,
use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler,
override_lr_scheduler=neox_args.override_lr_scheduler,
use_mup=neox_args.use_mup,
)
return lr_scheduler
def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
    """Setup model and optimizer (plus the memory profiler, if enabled)."""
    if neox_args.memory_profiling:
        torch.cuda.memory._record_memory_history(
            True,
            # keep a maximum 100,000 alloc/free events from before the snapshot
            trace_alloc_max_entries=100000,
            trace_alloc_record_context=True,
        )
needs_reference_model = (
(neox_args.train_impl == "dpo")
and (neox_args.precompute_model_name is None)
and (not neox_args.dpo_reference_free)
) or ((neox_args.train_impl == "kto") and (neox_args.precompute_model_name is None))
model = get_model(neox_args=neox_args, use_cache=use_cache)
if needs_reference_model:
reference_model = get_model(neox_args=neox_args, use_cache=use_cache)
else:
reference_model = None
optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args)
lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args)
if neox_args.deepspeed and needs_reference_model:
# Need an optimizer & lr_scheduler so make a very small one to keep deepspeed happy...
ref_optimizer, ref_param_groups = get_optimizer(
model=reference_model, neox_args=neox_args, dummy=True
)
ref_lr_scheduler = get_learning_rate_scheduler(
optimizer=ref_optimizer, neox_args=neox_args
)
else:
ref_optimizer, ref_param_groups, ref_lr_scheduler = None, None, None
if neox_args.deepspeed:
print_rank_0("DeepSpeed is enabled.")
_model_params = param_groups if optimizer is None else None
_lr_scheduler = lr_scheduler
model, optimizer, _, lr_scheduler = deepspeed.initialize(
model=model,
optimizer=optimizer,
args=neox_args,
lr_scheduler=_lr_scheduler,
dist_init_required=False,
model_parameters=_model_params,
# Need to remove the below so that it doesn't conflict with --deepspeed_config required by autotuning
# config_params=neox_args.deepspeed_config,
mpu=mpu if not neox_args.is_pipe_parallel else None,
)
if needs_reference_model:
reference_model, _, _, _ = deepspeed.initialize(
model=reference_model,
optimizer=ref_optimizer,
args=neox_args,
lr_scheduler=ref_lr_scheduler,
dist_init_required=False,
model_parameters=ref_param_groups,
mpu=mpu if not neox_args.is_pipe_parallel else None,
)
mark_norms_for_sequence_parallel_grad_sync(model, neox_args)
if neox_args.moe_num_experts > 1 and neox_args.moe_type == "megablocks":
# We need to additionally set this flag to ensure DS parallelism properly handles this foreign MoE.
model.has_moe_layers = True
model.total_params = get_total_params(model.module)
    print_rank_0(f" > total params: {model.total_params:,}")
if neox_args.is_pipe_parallel:
model.set_has_attention_mask(True)
if neox_args.curriculum_learning:
curr_scheduler = CurriculumScheduler(neox_args.curriculum_learning)
if iteration is not None and iteration > 0:
curr_scheduler.update_difficulty(iteration)
else:
curr_scheduler = None
model.set_batch_fn(
partial(
get_batch_pipe, neox_args=neox_args, curr_scheduler=curr_scheduler
)
)
else:
model.module.set_batch_fn(
partial(get_batch_sequential, neox_args=neox_args)
)
else:
raise ValueError("Must be using deepspeed to run neox")
if neox_args.load is not None:
neox_args.iteration = load_checkpoint(
neox_args=neox_args,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
iteration=iteration,
)
if needs_reference_model:
_ = load_checkpoint(
neox_args=neox_args,
model=reference_model,
optimizer=ref_optimizer,
lr_scheduler=ref_lr_scheduler,
iteration=iteration,
)
reference_model.eval()
print_rank_0(
f"Loading checkpoint and starting from iteration {neox_args.iteration}"
)
else:
neox_args.iteration = 0
# need this for correct lr scheduling resume from ckpt
# but it will not exist if this is being called for inference
if lr_scheduler is not None:
lr_scheduler.optimizer = model.optimizer
return model, optimizer, lr_scheduler, reference_model
def backward_step(neox_args, timers, optimizer, model, loss):
"""Backward step."""
# Backward pass.
timers("backward-backward").start()
if neox_args.deepspeed:
model.backward(loss)
else:
raise ValueError("Must be using deepspeed to run neox")
timers("backward-backward").stop()
if neox_args.deepspeed:
# DeepSpeed backward propagation already addressed all reduce communication.
# Reset the timer to avoid breaking timer logs below.
timers("backward-allreduce").reset()
else:
raise ValueError("Must be using deepspeed to run neox")
def train_step(
neox_args,
timers,
data_iterator,
model,
optimizer,
lr_scheduler,
reference_model=None,
):
"""Single training step."""
# Pipeline parallelism schedules forward/backward/step
if neox_args.is_pipe_parallel:
reduced_loss = train_step_pipe(
neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator
)
reduce_metrics = reduced_loss
if (
neox_args.memory_profiling
and neox_args.iteration >= neox_args.profile_step_start
and neox_args.iteration <= neox_args.profile_step_stop
and torch.distributed.get_rank() == 0
):
save_snapshot(neox_args)
else:
losses = []
metric_dicts = defaultdict(list)
for _ in range(neox_args.gradient_accumulation_steps):
# Forward model for one step.
timers("forward").start()
loss, metric_dict = forward_step(
neox_args=neox_args,
timers=timers,
data_iterator=data_iterator,
model=model,
is_train=True,
reference_model=reference_model,
)
timers("forward").stop()
losses.append(loss)
for key in metric_dict.keys():
metric_dicts[key].append(metric_dict[key])
# Calculate gradients, reduce across processes, and clip.
if (
neox_args.profile
and neox_args.iteration >= neox_args.profile_step_start
and neox_args.iteration <= neox_args.profile_step_stop
):
torch.cuda.nvtx.range_push(f"Backward pass")
timers("backward").start()
backward_step(
neox_args=neox_args,
timers=timers,
optimizer=optimizer,
model=model,
loss=loss,
)
timers("backward").stop()
if (
neox_args.profile
and neox_args.iteration >= neox_args.profile_step_start
and neox_args.iteration <= neox_args.profile_step_stop
):
torch.cuda.nvtx.range_pop()
# Update parameters.
if (
neox_args.profile
and neox_args.iteration >= neox_args.profile_step_start
and neox_args.iteration <= neox_args.profile_step_stop
):
torch.cuda.nvtx.range_push(f"Optimizer step")
timers("optimizer").start()
if neox_args.deepspeed:
model.step()
else:
raise ValueError("Must be using deepspeed to run neox")
timers("optimizer").stop()
if (
neox_args.profile
and neox_args.iteration >= neox_args.profile_step_start
and neox_args.iteration <= neox_args.profile_step_stop
):
torch.cuda.nvtx.range_pop()
if (
neox_args.profile
and neox_args.iteration >= neox_args.profile_step_start
and neox_args.iteration <= neox_args.profile_step_stop
and torch.distributed.get_rank() == 0
):
save_snapshot(neox_args)
# reduces metrics across machines for logging
reduce_metrics = {
key: reduce_losses(metric_dicts[key]).mean() for key in metric_dicts.keys()
}
reduce_metrics["lm_loss"] = reduce_losses(losses).mean()
if neox_args.precision == "fp16" and model.optimizer.overflow:
skipped_iter = 1
else:
skipped_iter = 0
collect_loss_for_unit_test(reduce_metrics["lm_loss"])
return reduce_metrics, skipped_iter
def train_step_pipe(neox_args, timers, model, data_iterator):
"""Single training step with DeepSpeed's pipeline parallel engine."""
assert neox_args.deepspeed
loss = model.train_batch(data_iter=data_iterator)
loss_dict = {"lm_loss": loss}
# Don't break Megatron's timers because we changed code paths.
for t in [
"forward",
"backward",
"allreduce",
"optimizer",
"batch generator",
"data loader",
]:
timers(t).reset()
return loss_dict
def is_save_iter(neox_args, iteration):
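    """Return True if a checkpoint should be saved at this iteration.

    With checkpoint_scale == "linear", saves every checkpoint_factor iterations;
    with "log", saves at iterations 1, f, f^2, ... for f = checkpoint_factor
    (plus any iterations listed in extra_save_iters).
    """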
if neox_args.extra_save_iters and iteration in neox_args.extra_save_iters:
return True
if neox_args.checkpoint_factor:
if neox_args.checkpoint_scale == "linear":
assert float(
neox_args.checkpoint_factor
).is_integer(), "checkpoint_factor must be a whole number when using linear checkpoint_scale"
return iteration % neox_args.checkpoint_factor == 0
elif neox_args.checkpoint_scale == "log":
# Check if iteration is a power of checkpoint_factor
assert neox_args.checkpoint_factor > 1
power = 1
while power < iteration + 1:
if int(power) == iteration:
return True
power *= neox_args.checkpoint_factor
return False
return False
def train(
neox_args,
timers,
model,
reference_model,
optimizer,
lr_scheduler,
train_data_iterator,
valid_data_iterator,
):
"""Train the model function."""
# Turn on training mode which enables dropout.
model.train()
# Tracking loss.
total_loss_dict = {}
# Iterations.
iteration = neox_args.iteration
timers("interval time").start()
report_memory_flag = True
# get noise scale logger (if neox_args.log_gradient_noise_scale is True)
noise_scale_logger = get_noise_scale_logger(neox_args)
# to monitor if we've skipped many iterations in a row and trigger an early exit
overflow_monitor = OverflowMonitor(optimizer)
if neox_args.profile:
schedule = torch.profiler.schedule(
wait=neox_args.profile_step_start,
warmup=1,
active=neox_args.profile_step_stop - neox_args.profile_step_start,
)
prof = torch.profiler.profile(
schedule=schedule,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
neox_args.tensorboard_dir
),
record_shapes=True,
profile_memory=True,
with_flops=True,
with_modules=True,
with_stack=True,
)
prof.start()
while iteration < neox_args.train_iters:
if neox_args.profile:
prof.step()
if neox_args.profile and iteration == neox_args.profile_step_start:
torch.cuda.cudart().cudaProfilerStart()
loss_dict, skipped_iter = train_step(
neox_args=neox_args,
timers=timers,
data_iterator=train_data_iterator,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
reference_model=reference_model,
)
if neox_args.profile and iteration == neox_args.profile_step_stop:
torch.cuda.cudart().cudaProfilerStop()
prof.stop()
iteration += 1
neox_args.iteration = iteration
if neox_args.precision == "fp16":
overflow_monitor.check(skipped_iter) # check for repeated overflow
if neox_args.log_gradient_noise_scale: # log noise scale if applicable
noise_scale_logger.update()
# get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you
# may have no tunable parameters on a specific rank
if optimizer.param_groups:
lr = optimizer.param_groups[0].get("lr", 0)
else:
lr = 0
# Logging.
report_memory_flag = training_log(
neox_args=neox_args,
timers=timers,
loss_dict=loss_dict,
total_loss_dict=total_loss_dict,
learning_rate=lr,
iteration=iteration,
loss_scale=optimizer.cur_scale if neox_args.precision == "fp16" else None,
report_memory_flag=report_memory_flag,
skipped_iter=skipped_iter,
model=model,
optimizer=optimizer,
noise_scale_logger=noise_scale_logger,
)
# Checkpointing
if neox_args.save and is_save_iter(neox_args, iteration):
save_checkpoint(
neox_args=neox_args,
iteration=iteration,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
)
# Evaluation
if (
neox_args.eval_interval
and iteration % neox_args.eval_interval == 0
and neox_args.do_valid
):
prefix = "iteration {}".format(iteration)
evaluate_and_print_results(
neox_args=neox_args,
prefix=prefix,
forward_step_func=forward_step,
data_iterator=valid_data_iterator,
model=model,
iteration=iteration,
verbose=False,
timers=timers,
reference_model=reference_model,
)
if neox_args.exit_interval and iteration % neox_args.exit_interval == 0:
torch.distributed.barrier()
time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
rank = torch.distributed.get_rank()
print_rank_0(
"rank: {} | time: {} | exiting the program at iteration {}".format(
rank, time_str, iteration
)
)
sys.exit()
return iteration
def evaluate(
neox_args,
forward_step_fn,
data_iterator,
model,
verbose=False,
timers=None,
reference_model=None,
):
"""Evaluation.
neox_args: NeoX Arguments
    forward_step_fn: function with args `neox_args, timers,
        data_iterator & model` that will run a forward pass on the model
data_iterator: Iterator that iterates over batches of data. Should return data in the form:
{'text': np.array([tokens], dtype=np.int64)}
where the size of the array is the model's context size + 1
(`get_batch` transforms it into inputs / labels)
"""
# Turn on evaluation mode which disables dropout.
model.eval()
losses = []
metric_dicts = defaultdict(list)
if neox_args.char_level_ppl:
data_iterator = CharCounter(data_iterator, neox_args.tokenizer)
with torch.no_grad():
iteration = 0
while iteration < neox_args.eval_iters:
iteration += 1
if verbose and iteration % neox_args.log_interval == 0:
print_rank_0(
"Evaluating iter {}/{}".format(iteration, neox_args.eval_iters)
)
            # Although we're not accumulating gradients here, we count one iteration as
            # train_batch_size_per_gpu * gradient_accumulation_steps to stay consistent with
            # DeepSpeed's pipe parallel engine; since pipe parallel already accounts for
            # gradient_accumulation_steps, default to a single pass when pipe parallel is on.
for _ in range(
1
if neox_args.is_pipe_parallel
else neox_args.gradient_accumulation_steps
):
# Forward evaluation
loss, metric_dict = forward_step_fn(
model=model,
data_iterator=data_iterator,
neox_args=neox_args,
timers=timers,
reference_model=reference_model,
)
losses.append(loss)
for key in metric_dict.keys():
metric_dicts[key].append(metric_dict[key])
# When contiguous memory optimizations are enabled, the buffers
# allocated by the optimizations are deallocated during backward pass
# in the absence of backward pass the buffers should be reset after each
# forward pass
if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
deepspeed.checkpointing.reset()
# reduces losses across processes for logging & run eval harness tasks
eval_results = {"lm_loss": reduce_losses(losses).mean().item()}
for key in metric_dicts.keys():
eval_results[key] = reduce_losses(metric_dicts[key]).mean().item()
eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"])
if neox_args.char_level_ppl:
        # calculate character level perplexity, if specified
        # unwrap the data_iterator
tokens_per_char = data_iterator.tokens_per_char()
print_rank_0(f"Counting chars took {data_iterator.total_time} seconds")
data_iterator = data_iterator.data_iterator
eval_results["lm_loss_char_lvl_ppl"] = math.exp(
eval_results["lm_loss"] * tokens_per_char
)
if neox_args.eval_tasks:
from eval_tasks import run_eval_harness
eval_results.update(
run_eval_harness(
model, forward_step_fn, neox_args, eval_tasks=neox_args.eval_tasks
).get("results")
)
# Move model back to the train mode.
model.train()
return eval_results
def collect_loss_for_unit_test(lm_loss):
    # Logic moved to a separate function to allow tracking in unit tests with unittest.mock.patch
pass
def evaluate_and_print_results(
neox_args,
prefix,
forward_step_func,
data_iterator,
model,
iteration,
verbose=False,
timers=None,
chart_name="validation",
reference_model=None,
):
"""Helper function to evaluate and dump results on screen."""
total_loss_dict = evaluate(
neox_args=neox_args,
forward_step_fn=forward_step_func,
data_iterator=data_iterator,
model=model,
verbose=verbose,
timers=timers,
reference_model=reference_model,
)
string = f" {chart_name} results at {prefix} | "
for k, v in total_loss_dict.items():
if isinstance(v, dict):
if neox_args.eval_tasks and "results" in v:
v = v["results"]
print(v)
for k2, v2 in v.items():
k3 = "_".join([k, k2])
string += f"{k3} value: {v2:.6E} | "
tb_wandb_log(
f"{chart_name}/{k3}",
v2,
iteration,
use_wandb=neox_args.use_wandb,
tensorboard_writer=neox_args.tensorboard_writer,
comet_experiment=neox_args.comet_experiment,
)
else:
string += f"{k} value: {v:.6E} | "
tb_wandb_log(
f"{chart_name}/{k}",
v,
iteration,
use_wandb=neox_args.use_wandb,
tensorboard_writer=neox_args.tensorboard_writer,
comet_experiment=neox_args.comet_experiment,
)
length = len(string) + 1
print_rank_0("-" * length)
print_rank_0(string)
print_rank_0("-" * length)
def save_snapshot(neox_args):
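    """Dump a pickled CUDA memory snapshot to memory_profiling_path; the pickle can be
    inspected with PyTorch's memory visualization tooling."""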
assert (
neox_args.memory_profiling_path is not None
), "Must pass memory_profiling_path config arg to use profiling"
snapshot = torch.cuda.memory._snapshot()
    snapshot_path = neox_args.memory_profiling_path
if not os.path.exists(snapshot_path):
os.makedirs(snapshot_path)
with open(os.path.join(snapshot_path, "mem_snapshot.pickle"), "wb") as f:
dump(snapshot, f)