"""Single-GPU 300M training-step benchmark.

No DeepSpeed, no accelerate — just torch + the model. Measures forward +
backward + optimizer step throughput on 1× RTX 5090 (32 GB).

Run:
  cd /root/bitnet1/code
  /venv/main/bin/python _bench_300m_1gpu.py [--max-steps 50] [--per-gpu-bs 1] [--grad-accum 4]
"""
import os, time, math, argparse
# Both variables are set before torch is imported below. setdefault() leaves
# any value already exported in the environment untouched, so they can be
# overridden from the shell.
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')
os.environ.setdefault('TORCHINDUCTOR_CACHE_DIR', '/root/bitnet1/inductor_cache')
import numpy as np
import torch
import torch.nn.functional as F
import torch._inductor.config as _ic
# Inductor settings are applied process-wide at import time, i.e. before
# main() calls torch.compile on the model.
# NOTE(review): presumably "ATEN" restricts GEMM autotuning candidates to ATen
# kernels — confirm against the installed torch version.
_ic.max_autotune_gemm_backends = "ATEN"
_ic.coordinate_descent_tuning = True
_ic.epilogue_fusion = True

# Project-local modules: the model class and the module exposing
# set_gumbel_tau(), both used in main().
from model_v47b import BitLMv47B
import model_v16 as v16


def main():
    """Benchmark training-step throughput of the 300M model on ``cuda:0``.

    Command-line flags:
      --max-steps N      number of optimizer steps to run (default 50, >= 1)
      --per-gpu-bs N     sequences per micro-batch (default 1, >= 1)
      --grad-accum N     micro-batches per optimizer step (default 4, >= 1)
      --checkpoint / --no-checkpoint
                         value passed to the model as ``use_checkpoint``
                         (default: enabled)
      --eager            skip ``torch.compile`` and run the model eagerly

    Each optimizer step runs ``grad_accum`` forward/backward passes under
    bf16 autocast, clips the gradient norm to 1.0 and applies fused AdamW.
    The first ``warmup_steps`` steps are excluded from the throughput figure
    so that compilation time does not skew it; the timer restarts once they
    have completed. Batch loading is part of the timed region.

    Progress and the final tok/s figure are printed to stdout; nothing is
    returned.

    Raises:
        ValueError: if the token file is too short to draw one
            ``seq_len + 1`` window from it.
    """
    # Steps excluded from the steady-state measurement.
    warmup_steps = 5

    ap = argparse.ArgumentParser()
    ap.add_argument('--max-steps', type=int, default=50)
    ap.add_argument('--per-gpu-bs', type=int, default=1)
    ap.add_argument('--grad-accum', type=int, default=4)
    ap.add_argument('--checkpoint', action='store_true', default=True)
    ap.add_argument('--no-checkpoint', dest='checkpoint', action='store_false')
    ap.add_argument('--eager', action='store_true')
    args = ap.parse_args()

    # Reject non-positive counts up front: they would otherwise produce a
    # crash (--max-steps 0) or meaningless throughput numbers.
    for flag, value in (('--max-steps', args.max_steps),
                        ('--per-gpu-bs', args.per_gpu_bs),
                        ('--grad-accum', args.grad_accum)):
        if value < 1:
            ap.error(f'{flag} must be >= 1, got {value}')

    device = 'cuda:0'
    torch.cuda.set_device(0)

    # 300M architecture (same as production)
    vocab_size = 16384
    d_model = 1536
    n_layers = 16
    n_heads = 24
    d_ff = 1536
    seq_len = 2048
    bs = args.per_gpu_bs
    ga = args.grad_accum
    eff_tok = bs * ga * seq_len  # tokens consumed per optimizer step

    print(f'[bench] arch: d_model={d_model} n_layers={n_layers} n_heads={n_heads} '
          f'd_ff={d_ff} seq_len={seq_len}', flush=True)
    print(f'[bench] bs={bs} grad_accum={ga} eff_tok/step={eff_tok}', flush=True)

    v16.set_gumbel_tau(0.1)
    m = BitLMv47B(vocab_size=vocab_size, d_model=d_model, n_layers=n_layers,
                  n_heads=n_heads, d_ff=d_ff, max_seq_len=seq_len,
                  slope_groups=8).to(device)
    n_params = sum(p.numel() for p in m.parameters())
    print(f'[bench] {n_params/1e6:.1f}M params', flush=True)

    # Cast ALiBi to fp32
    for blk in m.blocks:
        if blk.attn.alibi_bias.dtype != torch.float32:
            blk.attn.alibi_bias = blk.attn.alibi_bias.float()

    if not args.eager:
        m = torch.compile(m, mode='default', dynamic=False, fullgraph=False)
        print('[bench] compiled', flush=True)

    # Parameters whose name contains one of these substrings get no weight
    # decay; everything else is decayed.
    no_decay_keys = ('embed', 'codebook', 'logit_scale')
    body, small = [], []
    for name, p in m.named_parameters():
        if any(key in name for key in no_decay_keys):
            small.append(p)
        else:
            body.append(p)
    opt = torch.optim.AdamW(
        [{'params': body, 'weight_decay': 0.1},
         {'params': small, 'weight_decay': 0.0}],
        lr=4e-4, betas=(0.9, 0.95), eps=1e-8, fused=True,
    )
    # Loop-invariant: the full parameter list handed to gradient clipping.
    clip_params = body + small

    # Token windows are drawn at random offsets from the training file; the
    # loss is only reported as a sanity check, throughput is what we measure.
    rng = np.random.RandomState(42)
    train_arr = np.memmap('/root/bitnet1/data_fineweb_edu/train.bin',
                          dtype=np.uint16, mode='r')
    max_start = len(train_arr) - seq_len - 1
    if max_start < 1:
        raise ValueError(
            f'train.bin holds {len(train_arr)} tokens; need at least '
            f'{seq_len + 2} to sample a window of seq_len={seq_len}')

    def get_batch():
        """Return one (inputs, targets) micro-batch on ``device``.

        Targets are the inputs shifted by one token. The batch is staged in
        pinned host memory so the host-to-device copies can be non-blocking.
        """
        starts = rng.randint(0, max_start, size=bs)
        x = torch.empty(bs, seq_len, dtype=torch.int64, pin_memory=True)
        y = torch.empty(bs, seq_len, dtype=torch.int64, pin_memory=True)
        for row, s in enumerate(starts):
            x[row].copy_(torch.from_numpy(train_arr[s:s + seq_len].astype(np.int64)))
            y[row].copy_(torch.from_numpy(train_arr[s + 1:s + 1 + seq_len].astype(np.int64)))
        return x.to(device, non_blocking=True), y.to(device, non_blocking=True)

    def tok_per_s(steps, seconds):
        """Throughput over ``steps`` optimizer steps taking ``seconds``."""
        # Guard only against a zero-length interval; do not clamp to 1 s,
        # which would under-report short measurement windows.
        return steps * eff_tok / max(seconds, 1e-9)

    print(f'[bench] running {args.max_steps} steps...', flush=True)
    t0 = time.perf_counter()
    train_started = None  # set once the warmup steps have completed
    for step in range(1, args.max_steps + 1):
        opt.zero_grad(set_to_none=True)
        # Accumulate the step loss on the device so the micro-batch loop does
        # not force a host sync on every iteration.
        step_loss = torch.zeros((), dtype=torch.float32, device=device)
        for _ in range(ga):
            x, y = get_batch()
            with torch.autocast('cuda', dtype=torch.bfloat16):
                logits, _ = m(x, None, use_checkpoint=args.checkpoint)
                loss = F.cross_entropy(logits.reshape(-1, vocab_size), y.reshape(-1)) / ga
            loss.backward()
            step_loss += loss.detach().float()
        torch.nn.utils.clip_grad_norm_(clip_params, 1.0)
        opt.step()
        # Make sure all queued GPU work is finished before reading the clock.
        torch.cuda.synchronize()
        now = time.perf_counter()

        if step == 1:
            print(f'[bench] step 1 done (warmup+compile): {now-t0:.1f}s, '
                  f'loss={step_loss.item():.3f}', flush=True)
        elif step == warmup_steps:
            # Reset timer to skip remaining compile
            train_started = now
            print(f'[bench] step {step}  loss={step_loss.item():.3f}', flush=True)
        elif step > warmup_steps and step % 10 == 0:
            elapsed = now - train_started
            steps_done = step - warmup_steps
            print(f'[bench] step {step}  loss={step_loss.item():.3f}  '
                  f'{tok_per_s(steps_done, elapsed):.0f} tok/s  ({elapsed:.1f}s)',
                  flush=True)

    steps_done = args.max_steps - warmup_steps
    if train_started is None or steps_done <= 0:
        # Run ended during warmup: there is no steady-state window to report.
        print(f'\n[bench] FINAL: no steady-state steps measured '
              f'(--max-steps must exceed the {warmup_steps} warmup steps, '
              f'{n_params/1e6:.1f}M params)', flush=True)
        return

    elapsed = time.perf_counter() - train_started
    tok = steps_done * eff_tok
    print(f'\n[bench] FINAL: {tok_per_s(steps_done, elapsed):.0f} tok/s steady-state '
          f'({steps_done} steps, {elapsed:.1f}s, {tok:,} tokens, {n_params/1e6:.1f}M params)',
          flush=True)


# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()