akswelh
/

NEOX

Model card Files Files and versions Community

NEOX / tests /model /test_model_instantiation.py

akswelh's picture

Upload 251 files

d90b3a8 verified about 1 month ago

3.95 kB

	# Copyright (c) 2024, EleutherAI
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	Instantiate models with different configurations as a first possible point of failure.
	"""

	import pytest

	import torch
	import os
	from tests.common import (
	DistributedTest,
	model_setup,
	clear_test_dirs,
	parametrize,
	binary,
	)

	# Loss-scaling options shared by both precision settings listed under
	# "fp16,fp32_allreduce" below.
	_LOSS_SCALING = {
	    "loss_scale": 0,
	    "loss_scale_window": 1000,
	    "hysteresis": 2,
	    "min_loss_scale": 1,
	}

	# Candidate values for every model option exercised by ``test_instantiate``.
	# NOTE(review): keys joined by commas appear to name several options whose
	# values are supplied together per entry -- confirm against ``parametrize``.
	PARAMS_TO_TEST = {
	    "pipe_parallel_size,model_parallel_size,world_size": [
	        [0, 1, 1],
	        [1, 2, 2],
	        [0, 2, 2],
	    ],
	    "no_weight_tying": binary,
	    # One single-layer-pattern attention config per attention type.
	    "attention_config": [
	        [[[attention_type], "all"]]
	        for attention_type in (
	            "global",
	            "local",
	            "sparse_variable",
	            "sparse_fixed",
	        )
	    ],
	    "scaled_upper_triang_masked_softmax_fusion,bias_gelu_fusion": [
	        [True, False],
	        [False, True],
	    ],
	    "fp16,fp32_allreduce": [
	        # Entry with an explicit "bfloat16" type, paired with fp32_allreduce=True.
	        [{"enabled": True, "type": "bfloat16", **_LOSS_SCALING}, True],
	        # Entry without an explicit type, paired with fp32_allreduce=False.
	        [{"enabled": True, **_LOSS_SCALING}, False],
	    ],
	}

	# The MAX_TESTCASES environment variable (default 50) is forwarded as
	# ``max_tests``; no seed is supplied.
	parameters, names = parametrize(
	    PARAMS_TO_TEST,
	    max_tests=int(os.getenv("MAX_TESTCASES", 50)),
	    seed=None,
	)


	@pytest.mark.xfail(
	    reason="Either fused kernels are not installed, or 'Cannot re-initialize CUDA in forked subprocess'"
	)
	@pytest.mark.parametrize("param_dict", parameters, ids=names)
	def test_instantiate(param_dict):
	    """Instantiate a model for one parameter combination from ``PARAMS_TO_TEST``.

	    Args:
	        param_dict: One entry of ``parameters`` as produced by ``parametrize``.

	    The test is marked ``xfail`` for the environment-related reasons given
	    in the marker above.
	    """
	    runner = test_instantiate_optimizers_class()
	    # Pass by keyword so the dict is always bound to ``param_dict`` and can
	    # never be taken for the ``yaml_list`` argument.
	    runner.run_test_model_instantiation(param_dict=param_dict)


	# Optimizer types covered by ``test_instantiate_optimizers``; each one is
	# configured with the same learning rate.
	_OPTIMIZER_TYPES = (
	    "adam",
	    "onebitadam",
	    "cpu_adam",
	    "cpu_torch_adam",
	    "sm3",
	    "lion",
	    "madgrad_wd",
	)

	OPTIMIZER_PARAMS = {
	    "optimizer": [
	        {"type": optimizer_type, "params": {"lr": 0.0006}}
	        for optimizer_type in _OPTIMIZER_TYPES
	    ]
	}

	# The MAX_TESTCASES environment variable (default 50) is forwarded as
	# ``max_tests``; no seed is supplied.
	opt_params, opt_name = parametrize(
	    OPTIMIZER_PARAMS,
	    max_tests=int(os.getenv("MAX_TESTCASES", 50)),
	    seed=None,
	)


	@pytest.mark.xfail(
	    reason="Either fused kernels are not installed, or 'Cannot re-initialize CUDA in forked subprocess'"
	)
	@pytest.mark.parametrize("param_dict", opt_params, ids=opt_name)
	def test_instantiate_optimizers(param_dict):
	    """Instantiate a model for one optimizer entry from ``OPTIMIZER_PARAMS``.

	    Args:
	        param_dict: One entry of ``opt_params`` as produced by ``parametrize``.
	    """
	    test_instantiate_optimizers_class().run_test_model_instantiation(
	        param_dict=param_dict
	    )


	class test_instantiate_optimizers_class(DistributedTest):
	    """Build a model from a configuration and check the resulting engine type."""

	    # NOTE(review): presumably read by ``DistributedTest`` to decide how many
	    # ranks to use -- confirm in tests.common.
	    world_size = 2

	    def run_test_model_instantiation(self, param_dict=None, yaml_list=None):
	        """Set up a model and assert that the expected engine class was built.

	        The method previously had no ``self`` parameter, so a bound call put
	        the instance itself into ``yaml_list``. ``param_dict`` is the first
	        parameter after ``self`` so that a bound call with a single
	        positional argument still supplies the parameter dict.

	        Args:
	            param_dict: Configuration dict forwarded to ``model_setup``.
	            yaml_list: Optional YAML config list forwarded to ``model_setup``.

	        Raises:
	            AssertionError: If the model is not a ``DeepSpeedEngine`` when
	                ``pipe_parallel_size < 2``, or not a ``PipelineEngine``
	                otherwise.
	        """
	        # Imported lazily so that importing this module does not require
	        # deepspeed.
	        from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

	        # Optimizer and LR scheduler are returned but not needed here.
	        model, _, _, args_loaded = model_setup(yaml_list, param_dict)

	        if args_loaded.pipe_parallel_size < 2:
	            expected_engine = DeepSpeedEngine
	        else:
	            expected_engine = PipelineEngine

	        # Report whichever configuration source was actually supplied.
	        config_used = yaml_list if yaml_list is not None else param_dict
	        assert isinstance(
	            model, expected_engine
	        ), "test model instantiation " + str(config_used)

	        # Only a single process (sole rank or rank 0) clears the test dirs.
	        if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
	            clear_test_dirs()