iter_num: 7380 train_loss: 0.5264 val_loss: 1.3034 config: {'out_dir': 'out-rust', 'eval_interval': 20, 'log_interval': 1, 'eval_iters': 40, 'eval_only': False, 'always_save_checkpoint': True, 'init_from': 'resume', 'model_interval': 500, 'wandb_log': False, 'wandb_project': 'shakespeare', 'wandb_run_name': 'ft-1689415282.8434372', 'dataset': 'rust', 'gradient_accumulation_steps': 32, 'batch_size': 1, 'block_size': 1024, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'dropout': 0.0, 'bias': False, 'learning_rate': 0.0006, 'max_iters': 600000, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'decay_lr': True, 'warmup_iters': 2000, 'lr_decay_iters': 600000, 'min_lr': 6e-05, 'backend': 'nccl', 'device': 'cuda', 'dtype': 'float16', 'compile': True}