# estimate_train_vram.py
from vram_helpers import (
    activations_memory,
    activations_memory_per_layer,
    gradients_memory,
    kv_cache_memory,
    model_memory,
    optimizer_memory,
)


def training_vram_required(model_config, training_config):
# Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
trainable_parameters = model_config.model_size
if training_config.qlora:
model_config.precision = "int4"
        # QLoRA trains only a small adapter: roughly 0.02% of the base
        # parameters here (cf. the LoRA paper, https://arxiv.org/pdf/2106.09685,
        # which reports ~10,000x fewer trainable parameters for GPT-3)
        trainable_parameters = 0.0002 * model_config.model_size
model_vram = model_memory(parameters=trainable_parameters,
precision=model_config.precision,
mixed_precision=model_config.mixed_precision)
gradients_vram = gradients_memory(parameters=trainable_parameters)
optimizer_vram = optimizer_memory(parameters=trainable_parameters, optimizer=training_config.optimizer)
    # ZeRO stage 0 is the baseline: no partitioning, every GPU holds a full
    # replica of parameters, gradients, and optimizer states
# Optimizer state partitioning
if training_config.zero_stage >= 1:
optimizer_vram = optimizer_vram / training_config.num_gpus
    # Gradient + optimizer state partitioning
if training_config.zero_stage >= 2:
gradients_vram = gradients_vram / training_config.num_gpus
    # Parameter + gradient + optimizer state partitioning
    if training_config.zero_stage == 3:
        model_vram = model_vram / training_config.num_gpus

    aggregated_vram = model_vram + gradients_vram + optimizer_vram
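    # Example: at ZeRO stage 2 with 8 GPUs, gradients and optimizer states are
    # each sharded 8x across ranks while the parameters stay fully replicated;
    # only stage 3 shards the parameters as well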
activations_vram = activations_memory(model_config.num_layers,
model_config.sequence_length,
training_config.micro_batch_size,
model_config.hidden_size,
model_config.num_heads)
    if training_config.gradient_checkpointing:
        # Rough heuristic: checkpointing recomputes activations in the backward
        # pass, modelled here as the square root of the full activation memory
        activations_vram = round(activations_vram ** 0.5, 2)
total_vram = aggregated_vram + activations_vram
return {k: round(v, 2) for k, v in {
"total": total_vram,
"model": model_vram,
"gradients": gradients_vram,
"optimizer": optimizer_vram,
"activations": activations_vram
}.items()}
def inference_vram_required(model_config, training_config):
    # Mixed precision is hardcoded off for inference: weights are loaded at
    # the model's stated precision
    model_config.mixed_precision = False

    # Total inference VRAM ~= model weights + KV cache + activations;
    # framework and fragmentation overhead are not modelled here
model_vram = model_memory(parameters=model_config.model_size,
precision=model_config.precision,
mixed_precision=model_config.mixed_precision)
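    # The KV cache stores keys and values for every layer over the full
    # sequence; a common estimate is
    #   2 * num_layers * batch * total_seq_len * hidden_size * bytes(precision)
    # (the exact formula used here lives in vram_helpers.kv_cache_memory)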
kv_cache_vram = kv_cache_memory(batch_size=training_config.micro_batch_size,
total_sequence_length=model_config.total_sequence_length,
num_layers=model_config.num_layers,
num_heads=model_config.num_heads,
hidden_size=model_config.hidden_size,
precision=model_config.precision)
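    # During autoregressive decoding only about one layer's activations are
    # alive at a time, hence the per-layer estimate below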
activations_vram = activations_memory_per_layer(sequence_length=model_config.sequence_length,
micro_batch_size=training_config.micro_batch_size,
hidden_size=model_config.hidden_size,
num_heads=model_config.num_heads)
total_vram = model_vram + kv_cache_vram + activations_vram
return {k: round(v, 2) for k, v in {
"total": total_vram,
"model": model_vram,
"kv_cache": kv_cache_vram,
"activations": activations_vram
}.items()}
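

# Minimal usage sketch. The real ModelConfig/TrainingConfig objects come from
# elsewhere in this repo; the SimpleNamespace stand-ins below only carry the
# attributes read above, with illustrative (not authoritative) values. The
# accepted precision strings and the units of model_size depend on what
# vram_helpers expects.
if __name__ == "__main__":
    from types import SimpleNamespace

    model_config = SimpleNamespace(
        model_size=7e9,              # parameter count for a 7B model (or 7 if helpers expect billions)
        precision="bf16",            # "int4" is used above for QLoRA
        mixed_precision=False,
        num_layers=32,
        hidden_size=4096,
        num_heads=32,
        sequence_length=2048,
        total_sequence_length=2048,  # prompt + generated tokens, for the KV cache
    )
    training_config = SimpleNamespace(
        micro_batch_size=1,
        optimizer="adamw",
        qlora=False,
        zero_stage=0,
        num_gpus=1,
        gradient_checkpointing=True,
    )

    print("training:", training_vram_required(model_config, training_config))
    print("inference:", inference_vram_required(model_config, training_config))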