akswelh
/

NEOX

Model card Files Files and versions Community

NEOX / tests /config /test_setup.yml

akswelh's picture

Upload 251 files

d90b3a8 verified about 1 month ago

2.02 kB

	# Lightweight test configuration: derived from the 19M parameter model config,
	# with a local (single-host) setup and some additional simplifications.
	{
	# Settings to make the test setup as lightweight as possible
	"data_path": "data/enwik8/enwik8_text_document",
	"vocab_file": "data/gpt2-vocab.json",
	"merge_file": "data/gpt2-merges.txt",
	# lr_decay_iters matches train_iters (both 20)
	"lr_decay_iters": 20,
	"train_iters": 20,
	# NOTE(review): "None" is the literal string, not a YAML null — presumably the consumer expects that; confirm
	"hostfile": "None",
	"include": "localhost:1",
	# lowercase boolean, consistent with every other flag in this file
	"use_wandb": false,

	# Settings copied from the 19M parameter config (the modifications above mean we can't use configs/19M.yml directly)
	"pipe_parallel_size": 1,
	"model_parallel_size": 1,

	# model settings
	"num_layers": 2,
	"hidden_size": 8,
	"num_attention_heads": 4,
	"seq_length": 1024,
	"max_position_embeddings": 1024,
	"pos_emb": "rotary",
	"no_weight_tying": true,
	"gpt_j_residual": false,
	"output_layer_parallelism": "column",

	# fused kernels: all disabled
	"scaled_upper_triang_masked_softmax_fusion": false,
	"bias_gelu_fusion": false,
	"rope_fusion": false,
	"layernorm_fusion": false,

	# Optimizer
	"optimizer": {
	  "type": "sm3",
	  "params": {},
	},

	# precision
	"precision": "fp16",

	# init methods
	"init_method": "small_init",
	"output_layer_init_method": "wang_init",

	# batching and data loading
	"train_micro_batch_size_per_gpu": 4,
	"gradient_accumulation_steps": 1,
	"data_impl": "mmap",
	"num_workers": 1,

	# activation checkpointing
	"checkpoint_activations": true,
	"checkpoint_num_layers": 1,
	"partition_activations": true,
	"synchronize_each_layer": true,

	# regularization
	"gradient_clipping": 1.0,
	"weight_decay": 0.1,
	"hidden_dropout": 0,
	"attention_dropout": 0,

	"distributed_backend": "nccl",
	"lr_decay_style": "cosine",
	"warmup": 0.01,
	# checkpoint_factor (1000) and eval_interval (100000) both exceed train_iters (20)
	"checkpoint_factor": 1000,
	"eval_interval": 100000,
	"eval_iters": 10,

	# logging
	"log_interval": 10,
	"steps_per_print": 10,
	"wall_clock_breakdown": true,

	# additional deepspeed args not specified above
	"deepspeed_extra_args": {
	  "comms_logger": {
	    "enabled": true,
	    "verbose": true,
	    "prof_all": true,
	    "debug": false
	  },
	}
	}