AnshulRanjan2004 committed on Sep 25, 2024

Commit

c50fe14

verified ·

1 Parent(s): 7af0576

Uploading the Model

Browse files

Files changed (36) hide show

.gitattributes +1 -0
.gitignore +169 -0
.timetracker +1 -0
assets/benchmark.png +0 -0
assets/channel_mixing.gif +0 -0
assets/current_loss.png +0 -0
assets/gpt2_124M_loss.png +0 -0
assets/inference-time.png +0 -0
assets/nanoRWKV-loss.png +0 -0
assets/nanoRWKV.png +0 -0
assets/nanorwkv.jpg +0 -0
assets/time_mixing.gif +3 -0
bench.py +117 -0
benchmark_inference_time.py +130 -0
config/eval_gpt2.py +8 -0
config/eval_rwkv4_169m.py +7 -0
config/eval_rwkv4_430m.py +6 -0
config/finetune_shakespeare.py +25 -0
config/train_gpt2.py +26 -0
config/train_rwkv.py +35 -0
config/train_shakespeare_char.py +37 -0
configurator.py +47 -0
data/openwebtext/prepare.py +80 -0
data/openwebtext/readme.md +15 -0
data/shakespeare/prepare.py +33 -0
data/shakespeare/readme.md +9 -0
data/shakespeare_char/prepare.py +68 -0
data/shakespeare_char/readme.md +9 -0
generate.py +84 -0
modelGenerate.py +442 -0
modeling_rwkv.py +687 -0
out/.keep +0 -0
sample.py +101 -0
scaling_laws.ipynb +0 -0
train.py +363 -0
transformer_sizing.ipynb +402 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.ipynb linguist-generated

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.ipynb linguist-generated
+assets/time_mixing.gif filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,169 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+/data/summary/*
+/data/tinystories-15k/*
+/out/*.pt
+/venv/
+/keysModel.py
+/model.py
+/*.txt
+/trainKaggle.py

.timetracker ADDED Viewed

	@@ -0,0 +1 @@

+ {"total":626936,"sessions":[{"begin":"2024-03-19T15:28:08+07:00","end":"2024-03-19T16:59:43+07:00","duration":5494},{"begin":"2024-03-24T22:41:12+07:00","end":"2024-03-24T23:05:12+07:00","duration":1439},{"begin":"2024-03-26T08:49:00+07:00","end":"2024-03-26T10:38:06+07:00","duration":6546},{"begin":"2024-03-26T23:06:50+07:00","end":"2024-03-26T23:35:02+07:00","duration":1691},{"begin":"2024-03-26T23:51:53+07:00","end":"2024-03-27T00:28:51+07:00","duration":2218},{"begin":"2024-03-27T00:32:45+07:00","end":"2024-03-27T12:27:12+07:00","duration":42866},{"begin":"2024-03-28T10:46:58+07:00","end":"2024-03-28T13:00:02+07:00","duration":7983},{"begin":"2024-03-28T16:28:10+07:00","end":"2024-03-29T08:42:21+07:00","duration":58451},{"begin":"2024-03-29T20:37:47+07:00","end":"2024-03-30T14:44:24+07:00","duration":65196},{"begin":"2024-03-30T14:44:24+07:00","end":"2024-03-31T16:45:41+07:00","duration":93676},{"begin":"2024-03-31T17:03:36+07:00","end":"2024-03-31T17:04:06+07:00","duration":30},{"begin":"2024-03-31T17:04:13+07:00","end":"2024-04-01T15:02:44+07:00","duration":79111},{"begin":"2024-04-02T12:23:23+07:00","end":"2024-04-05T13:13:59+07:00","duration":262235}]}

assets/benchmark.png ADDED Viewed

assets/channel_mixing.gif ADDED Viewed

assets/current_loss.png ADDED Viewed

assets/gpt2_124M_loss.png ADDED Viewed

assets/inference-time.png ADDED Viewed

assets/nanoRWKV-loss.png ADDED Viewed

assets/nanoRWKV.png ADDED Viewed

assets/nanorwkv.jpg ADDED Viewed

assets/time_mixing.gif ADDED Viewed

Git LFS Details

SHA256: 1b07929960fc4a44998495330d5501a2ae24b1b5dac143fc408255bc054d1527
Pointer size: 132 Bytes
Size of remote file: 3.2 MB

bench.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""
+A much shorter version of train.py for benchmarking
+"""
+import os
+from contextlib import nullcontext
+import numpy as np
+import time
+import torch
+from modeling_gpt import GPTConfig, GPT
+# -----------------------------------------------------------------------------
+batch_size = 12
+block_size = 1024
+bias = False
+real_data = True
+seed = 1337
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
+compile = True # use PyTorch 2.0 to compile the model to be faster
+profile = False # use pytorch profiler, or just simple benchmarking?
+exec(open('configurator.py').read()) # overrides from command line or config file
+# -----------------------------------------------------------------------------
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
+torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+# data loading init
+if real_data:
+    dataset = 'openwebtext'
+    data_dir = os.path.join('data', dataset)
+    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+    def get_batch(split):
+        data = train_data # note ignore split in benchmarking script
+        ix = torch.randint(len(data) - block_size, (batch_size,))
+        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
+        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+        return x, y
+else:
+    # alternatively, if fixed data is desired to not care about data loading
+    x = torch.randint(50304, (batch_size, block_size), device=device)
+    y = torch.randint(50304, (batch_size, block_size), device=device)
+    get_batch = lambda split: (x, y)
+# model init
+gptconf = GPTConfig(
+    block_size = block_size, # how far back does the model look? i.e. context size
+    n_layer = 12, n_head = 12, n_embd = 768, # size of the model
+    dropout = 0, # for determinism
+    bias = bias,
+)
+model = GPT(gptconf)
+model.to(device)
+optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
+if compile:
+    print("Compiling model...")
+    model = torch.compile(model) # pytorch 2.0
+if profile:
+    # useful docs on pytorch profiler:
+    # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
+    # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
+    wait, warmup, active = 5, 5, 5
+    num_steps = wait + warmup + active
+    with torch.profiler.profile(
+        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
+        record_shapes=False,
+        profile_memory=False,
+        with_stack=False, # incurs an additional overhead, disable if not needed
+        with_flops=True,
+        with_modules=False, # only for torchscript models atm
+    ) as prof:
+        X, Y = get_batch('train')
+        for k in range(num_steps):
+            with ctx:
+                logits, loss = model(X, Y)
+            X, Y = get_batch('train')
+            optimizer.zero_grad(set_to_none=True)
+            loss.backward()
+            optimizer.step()
+            lossf = loss.item()
+            print(f"{k}/{num_steps} loss: {lossf:.4f}")
+            prof.step() # notify the profiler at end of each step
+else:
+    # simple benchmarking
+    torch.cuda.synchronize()
+    for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
+        t0 = time.time()
+        X, Y = get_batch('train')
+        for k in range(num_steps):
+            with ctx:
+                logits, loss = model(X, Y)
+            X, Y = get_batch('train')
+            optimizer.zero_grad(set_to_none=True)
+            loss.backward()
+            optimizer.step()
+            lossf = loss.item()
+            print(f"{k}/{num_steps} loss: {lossf:.4f}")
+        torch.cuda.synchronize()
+        t1 = time.time()
+        dt = t1-t0
+        mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
+        if stage == 1:
+            print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")

benchmark_inference_time.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from torch.profiler import ProfilerActivity, profile, record_function
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from torch import nn
+import torch
+torch.set_float32_matmul_precision('high')
+import json
+from argparse import ArgumentParser
+def sample(outputs):
+    next_token_logits = outputs.logits[:, -1, :]
+    probs = nn.functional.softmax(next_token_logits, dim=-1)
+    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+    return next_tokens
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--device",default='cuda')
+    parser.add_argument("--model",required=True)
+    parser.add_argument("--use_cache",action='store_true')
+    parser.add_argument("--max_new_tokens",type=int,default=16_000)
+    parser.add_argument("--output_path")
+    args = parser.parse_args()
+    prompt = 'hello' ## dummpy input
+    config = AutoConfig.from_pretrained(args.model)
+    config.max_position_embeddings = args.max_new_tokens+10
+    model = AutoModelForCausalLM.from_config(config)
+    model.eval()
+    model = model.to(args.device)
+    model = torch.compile(model)
+    model_size = sum(p.numel() for p in model.parameters())
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenized_prompt = tokenizer(prompt, return_tensors="pt")
+    tokenized_prompt = tokenized_prompt['input_ids'].to(args.device)
+    model_input = {
+        "input_ids":tokenized_prompt,
+        "use_cache":args.use_cache,
+    }
+    cache_name = "state" if args.model.startswith("RWKV") else "past_key_values"
+    model_input[cache_name]=None
+    os.makedirs(os.path.dirname(args.output_path),exist_ok=True)
+    writer = open(args.output_path,'w')
+    for tok_idx in range(args.max_new_tokens):
+        with torch.no_grad():
+            if args.use_cache and model_input[cache_name] is not None:model_input["input_ids"] = tokenized_prompt[:,-1:].to(args.device)
+            else:model_input["input_ids"] = tokenized_prompt.to(args.device)
+            with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=False) as prof:
+                with record_function("model_inference"):
+                    output = model.forward(**model_input)
+        model_input[cache_name]=getattr(output,cache_name)
+        next_tokens = sample(output)
+        tokenized_prompt = torch.cat([tokenized_prompt.cpu(), next_tokens[:, None].cpu()], dim=-1)
+        full_profile = next(event for event in prof.key_averages() if event.key == 'model_inference')
+        writer.write(json.dumps({
+            "model_name": args.model,
+            "model_size": model_size,
+            "token_id": tok_idx,
+            "strategy": args.device,
+            "cpu_time": full_profile.cpu_time,
+            "cuda_time": full_profile.cuda_time,
+            "cpu_memory_usage": full_profile.cpu_memory_usage,
+            "cuda_memory_usage": full_profile.cuda_memory_usage,
+            "self_cpu_memory_usage": full_profile.self_cpu_memory_usage,
+            "self_cuda_memory_usage": full_profile.self_cuda_memory_usage,
+            "max_memory_allocated":torch.cuda.max_memory_allocated(),
+        })+'\n'
+        )
+        torch.cuda.empty_cache()
+    writer.close()
+"""
+python benchmark_inference_time.py --model RWKV/rwkv-4-3b-pile --use_cache --output_path data/inference_time/rwkv-3b.jsonl
+python benchmark_inference_time.py --model RWKV/rwkv-4-7b-pile --use_cache --output_path data/inference_time/rwkv-7b.jsonl
+python benchmark_inference_time.py --model RWKV/rwkv-4-14b-pile --use_cache --output_path data/inference_time/rwkv-14b.jsonl
+python benchmark_inference_time.py --model facebook/opt-2.7b --use_cache --output_path data/inference_time/opt-2.7b.jsonl
+python benchmark_inference_time.py --model facebook/opt-6.7b --use_cache --output_path data/inference_time/opt-6.7b.jsonl
+python benchmark_inference_time.py --model EleutherAI/pythia-2.8b --use_cache --output_path data/inference_time/pythia-2.8b.jsonl
+python benchmark_inference_time.py --model EleutherAI/pythia-6.9b --use_cache --output_path data/inference_time/pythia-6.9b.jsonl
+python benchmark_inference_time.py --model EleutherAI/gpt-neo-2.7B --use_cache --output_path data/inference_time/gpt-neo-2.7B.jsonl
+############# Plotting Code ##############
+import numpy as np
+import json
+def get_jsonl(f): return [json.loads(x) for x in open(f).readlines()]
+import matplotlib.pyplot as plt
+fig, (ax1,ax2,ax3) = plt.subplots(1, 3,figsize=(18, 4))
+for model_name in [
+    "rwkv-3b",
+    # "rwkv-7b",
+    # "rwkv-14b",
+    "opt-2.7b",
+    "gpt-neo-2.7B",
+    "pythia-2.8b"
+    ]:
+    data = get_jsonl(f"data/inference_time/{model_name}.jsonl")
+    cuda_time = [x['cuda_time'] for x in data]
+    cumulative_time = np.cumsum(cuda_time)/(1000*1000)
+    memory_usage = [x['max_memory_allocated']/(2**10)/(2**10)/(2**10) for x in data]
+    ax1.plot([x/1000 for x in cuda_time][100:],label=model_name)
+    ax2.plot(cumulative_time,label=model_name)
+    ax3.plot(memory_usage,label=model_name)
+ax1.set_xlabel("# Tokens")
+ax1.set_ylabel("Time (ms) to generated the #-th token")
+ax1.grid()
+ax1.legend()
+ax1.set_title("Single Token Generation Latency")
+ax2.set_xlabel("# Tokens")
+ax2.set_ylabel("Cumulative time (s) to generated the #-th token")
+ax2.grid()
+ax2.legend()
+ax2.set_title("Cumulative Generation Latency")
+ax3.set_xlabel("# Tokens")
+ax3.set_ylabel("Memory usage (GB)")
+ax3.grid()
+ax3.legend()
+ax3.set_title("Memory usage in Generation")
+"""

config/eval_gpt2.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# evaluate the base gpt2
+# n_layer=12, n_head=12, n_embd=768
+# 124M parameters
+batch_size = 8
+eval_iters = 500 # use more iterations to get good estimate
+eval_only = True
+wandb_log = False
+init_from = 'gpt2'

config/eval_rwkv4_169m.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# evaluate the RWKV-4-169M
+batch_size = 8
+eval_iters = 500 # use more iterations to get good estimate
+eval_only = True
+wandb_log = False
+dtype = 'float16' # v100 doesn't support bf16
+init_from = 'RWKV/rwkv-4-169m-pile'

config/eval_rwkv4_430m.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# evaluate the RWKV-4-430M
+batch_size = 8
+eval_iters = 500 # use more iterations to get good estimate
+eval_only = True
+wandb_log = False
+init_from = 'RWKV/rwkv-4-430m-pile'
+dtype = 'float16' # v100 doesn't support bf16

config/finetune_shakespeare.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import time
+out_dir = 'out-shakespeare'
+eval_interval = 5
+eval_iters = 40
+wandb_log = False # feel free to turn on
+wandb_project = 'shakespeare'
+wandb_run_name = 'ft-' + str(time.time())
+dataset = 'shakespeare'
+init_from = 'gpt2-xl' # this is the largest GPT-2 model
+# only save checkpoints if the validation loss improves
+always_save_checkpoint = False
+# the number of examples per iter:
+# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+batch_size = 1
+gradient_accumulation_steps = 32
+max_iters = 20
+# finetune at constant LR
+learning_rate = 3e-5
+decay_lr = False

config/train_gpt2.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB
+# launch as the following (e.g. in a screen session) and wait ~5 days:
+# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
+wandb_log = True
+wandb_project = 'nanoRWKV'
+wandb_run_name='gpt2-124M'
+# these make the total batch size be ~0.5M
+# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
+batch_size = 12
+block_size = 1024
+gradient_accumulation_steps = 5 * 8
+# this makes total number of tokens be 300B
+max_iters = 600000
+lr_decay_iters = 600000
+dtype = 'float16'
+# eval stuff
+eval_interval = 1000
+eval_iters = 200
+log_interval = 10
+# weight decay
+weight_decay = 1e-1

config/train_rwkv.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# config for training RWKV (130M); adapted from config/train_gpt2.py, whose GPT-2 (124M) run reaches a loss of ~2.85 on 1 node of 8X A100 40GB
+# launch as the following (e.g. in a screen session); the GPT-2 run takes ~5 days:
+# $ torchrun --standalone --nproc_per_node=8 train.py config/train_rwkv.py
+wandb_log = True
+wandb_project = 'nanoRWKV'
+wandb_run_name='RWKV-130M'
+# these make the total batch size be ~0.5M
+# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
+batch_size = 12
+block_size = 1024
+gradient_accumulation_steps = 5 * 8
+# rwkv specific parameters
+dtype = 'float16' # v100 doesn't support bf16
+model_type = 'rwkv'
+# beta1 = 0.9
+# beta2 = 0.99
+# learning_rate = 8e-4
+# min_lr = 1e-5
+# warmup_iters = 0
+weight_decay = 1e-1
+use_customized_cuda_kernel = True
+# this makes total number of tokens be 300B
+max_iters = 600000
+lr_decay_iters = 600000
+# eval stuff
+eval_interval = 1000
+eval_iters = 200
+log_interval = 10

config/train_shakespeare_char.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# train a miniature character-level shakespeare model
+# good for debugging and playing on macbooks and such
+out_dir = 'out-shakespeare-char'
+eval_interval = 250 # keep frequent because we'll overfit
+eval_iters = 200
+log_interval = 10 # don't print too too often
+# we expect to overfit on this small dataset, so only save when val improves
+always_save_checkpoint = False
+wandb_log = False # override via command line if you like
+wandb_project = 'shakespeare-char'
+wandb_run_name = 'mini-gpt'
+dataset = 'shakespeare_char'
+gradient_accumulation_steps = 1
+batch_size = 64
+block_size = 256 # context of up to 256 previous characters
+# baby GPT model :)
+n_layer = 6
+n_head = 6
+n_embd = 384
+dropout = 0.2
+learning_rate = 1e-3 # with baby networks can afford to go a bit higher
+max_iters = 5000
+lr_decay_iters = 5000 # make equal to max_iters usually
+min_lr = 1e-4 # learning_rate / 10 usually
+beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
+warmup_iters = 100 # not super necessary potentially
+# on macbook also add
+# device = 'cpu'  # run on cpu only
+# compile = False # do not torch compile the model

configurator.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""
+Poor Man's Configurator. Probably a terrible idea. Example usage:
+$ python train.py config/override_file.py --batch_size=32
+this will first run config/override_file.py, then override batch_size to 32
+The code in this file will be run as follows from e.g. train.py:
+>>> exec(open('configurator.py').read())
+So it's not a Python module, it's just shuttling this code away from train.py
+The code in this script then overrides the globals()
+I know people are not going to love this, I just really dislike configuration
+complexity and having to prepend config. to every single variable. If someone
+comes up with a better simple Python solution I am all ears.
+"""
+import sys
+from ast import literal_eval
+for arg in sys.argv[1:]:
+    if '=' not in arg:
+        # assume it's the name of a config file
+        assert not arg.startswith('--')
+        config_file = arg
+        print(f"Overriding config with {config_file}:")
+        with open(config_file) as f:
+            print(f.read())
+        exec(open(config_file).read())
+    else:
+        # assume it's a --key=value argument
+        assert arg.startswith('--')
+        key, val = arg.split('=')
+        key = key[2:]
+        if key in globals():
+            try:
+                # attempt to eval it it (e.g. if bool, number, or etc)
+                attempt = literal_eval(val)
+            except (SyntaxError, ValueError):
+                # if that goes wrong, just use the string
+                attempt = val
+            # ensure the types match ok
+            assert type(attempt) == type(globals()[key])
+            # cross fingers
+            print(f"Overriding: {key} = {attempt}")
+            globals()[key] = attempt
+        else:
+            raise ValueError(f"Unknown config key: {key}")

data/openwebtext/prepare.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# saves the openwebtext dataset to a binary file for training. following was helpful:
+# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
+import os
+from tqdm import tqdm
+import numpy as np
+import tiktoken
+from datasets import load_dataset # huggingface datasets
+# number of workers in .map() call
+# good number to use is ~order number of cpu cores // 2
+num_proc = 8
+# number of workers in load_dataset() call
+# best number might be different from num_proc above as it also depends on NW speed.
+# it is better than 1 usually though
+num_proc_load_dataset = num_proc
+if __name__ == '__main__':
+    # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
+    dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
+    # owt by default only contains the 'train' split, so create a test split
+    split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
+    split_dataset['val'] = split_dataset.pop('test') # rename the test split to val
+    # this results in:
+    # >>> split_dataset
+    # DatasetDict({
+    #     train: Dataset({
+    #         features: ['text'],
+    #         num_rows: 8009762
+    #     })
+    #     val: Dataset({
+    #         features: ['text'],
+    #         num_rows: 4007
+    #     })
+    # })
+    # we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
+    enc = tiktoken.get_encoding("gpt2")
+    def process(example):
+        ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
+        ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
+        # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
+        out = {'ids': ids, 'len': len(ids)}
+        return out
+    # tokenize the dataset
+    tokenized = split_dataset.map(
+        process,
+        remove_columns=['text'],
+        desc="tokenizing the splits",
+        num_proc=num_proc,
+    )
+    # concatenate all the ids in each dataset into one large file we can use for training
+    for split, dset in tokenized.items():
+        arr_len = np.sum(dset['len'], dtype=np.uint64)
+        filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
+        dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
+        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
+        total_batches = 1024
+        idx = 0
+        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
+            # Batch together samples for faster write
+            batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
+            arr_batch = np.concatenate(batch['ids'])
+            # Write into mmap
+            arr[idx : idx + len(arr_batch)] = arr_batch
+            idx += len(arr_batch)
+        arr.flush()
+    # train.bin is ~17GB, val.bin ~8.5MB
+    # train has ~9B tokens (9,035,582,198)
+    # val has ~4M tokens (4,434,897)
+    # to read the bin files later, e.g. with numpy:
+    # m = np.memmap('train.bin', dtype=np.uint16, mode='r')

data/openwebtext/readme.md ADDED Viewed

	@@ -0,0 +1,15 @@

+## openwebtext dataset
+after running `prepare.py` (preprocess) we get:
+- train.bin is ~17GB, val.bin ~8.5MB
+- train has ~9B tokens (9,035,582,198)
+- val has ~4M tokens (4,434,897)
+this came from 8,013,769 documents in total.
+references:
+- OpenAI's WebText dataset is discussed in [GPT-2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
+- [OpenWebText](https://skylion007.github.io/OpenWebTextCorpus/) dataset

data/shakespeare/prepare.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import os
+import requests
+import tiktoken
+import numpy as np
+# download the tiny shakespeare dataset
+input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+if not os.path.exists(input_file_path):
+    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    with open(input_file_path, 'w') as f:
+        f.write(requests.get(data_url).text)
+with open(input_file_path, 'r') as f:
+    data = f.read()
+n = len(data)
+train_data = data[:int(n*0.9)]
+val_data = data[int(n*0.9):]
+# encode with tiktoken gpt2 bpe
+enc = tiktoken.get_encoding("gpt2")
+train_ids = enc.encode_ordinary(train_data)
+val_ids = enc.encode_ordinary(val_data)
+print(f"train has {len(train_ids):,} tokens")
+print(f"val has {len(val_ids):,} tokens")
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+# train.bin has 301,966 tokens
+# val.bin has 36,059 tokens

data/shakespeare/readme.md ADDED Viewed

	@@ -0,0 +1,9 @@

+# tiny shakespeare
+Tiny shakespeare, of the good old char-rnn fame :)
+After running `prepare.py`:
+- train.bin has 301,966 tokens
+- val.bin has 36,059 tokens

data/shakespeare_char/prepare.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Prepare the Shakespeare dataset for character-level language modeling.
+So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+Will save train.bin, val.bin containing the ids, and meta.pkl containing the
+encoder and decoder and some other related info.
+"""
+import os
+import pickle
+import requests
+import numpy as np
+# download the tiny shakespeare dataset
+input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+if not os.path.exists(input_file_path):
+    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    with open(input_file_path, 'w') as f:
+        f.write(requests.get(data_url).text)
+with open(input_file_path, 'r') as f:
+    data = f.read()
+print(f"length of dataset in characters: {len(data):,}")
+# get all the unique characters that occur in this text
+chars = sorted(list(set(data)))
+vocab_size = len(chars)
+print("all the unique characters:", ''.join(chars))
+print(f"vocab size: {vocab_size:,}")
+# create a mapping from characters to integers
+stoi = { ch:i for i,ch in enumerate(chars) }
+itos = { i:ch for i,ch in enumerate(chars) }
+def encode(s):
+    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
+def decode(l):
+    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+# create the train and test splits
+n = len(data)
+train_data = data[:int(n*0.9)]
+val_data = data[int(n*0.9):]
+# encode both to integers
+train_ids = encode(train_data)
+val_ids = encode(val_data)
+print(f"train has {len(train_ids):,} tokens")
+print(f"val has {len(val_ids):,} tokens")
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+# save the meta information as well, to help us encode/decode later
+meta = {
+    'vocab_size': vocab_size,
+    'itos': itos,
+    'stoi': stoi,
+}
+with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
+    pickle.dump(meta, f)
+# length of dataset in characters:  1115394
+# all the unique characters:
+#  !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
+# vocab size: 65
+# train has 1003854 tokens
+# val has 111540 tokens

data/shakespeare_char/readme.md ADDED Viewed

	@@ -0,0 +1,9 @@

+# tiny shakespeare, character-level
+Tiny shakespeare, of the good old char-rnn fame :) Treated on character-level.
+After running `prepare.py`:
+- train.bin has 1,003,854 tokens
+- val.bin has 111,540 tokens

generate.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import os
+import argparse
+import tiktoken
+import torch
+import time
+from modelGenerate import GPT
+from dataclasses import dataclass
+# Parse command-line arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('--prompt', type=str, required=True,
+                    help='Prompt for generation')
+parser.add_argument('--max_num_tokens', type=int, default=100,
+                    help='Maximum number of tokens to generate')
+parser.add_argument('--model_name', type=str, required=True,
+                    help='Name of the model checkpoint')
+args = parser.parse_args()
+@dataclass
+class GPTConfig:
+    """Hyper-parameters handed to `GPT`; this script fills them from the checkpoint's `model_args`."""
+    # maximum sequence length
+    block_size: int = 1024
+    # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    vocab_size: int = 50304
+    # number of blocks, heads per block, and embedding width
+    n_layer: int = 8
+    n_head: int = 8
+    n_embd: int = 768
+    # mixture-of-experts settings
+    num_experts: int = 4
+    num_active_experts: int = 4
+    expert_dim: int = 512
+    # NOTE(review): presumably expected to equal n_embd — confirm against the model definition
+    dim: int = 768
+    dropout: float = 0.0
+    # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+    bias: bool = False
+# Load the model checkpoint
+ckpt_path = os.path.join('./out', f'{args.model_name}.pt')
+checkpoint = torch.load(ckpt_path,torch.device('cpu'))
+print(checkpoint['config'])
+model_args = checkpoint['model_args']
+gptconf = GPTConfig(**model_args)
+model = GPT(gptconf)
+model.load_state_dict(checkpoint['model'])
+# model.cuda()
+model.eval()
+# Encode the prompt using tiktoken
+enc = tiktoken.get_encoding("gpt2")
+prompt_ids = enc.encode_ordinary(args.prompt)
+# Measure inference time
+start_time = time.time()  # Get the current time before generating text
+generated = model.generate(torch.tensor(
+    [prompt_ids], device='cpu'), max_new_tokens=args.max_num_tokens)
+end_time = time.time()  # Get the current time after generating text
+inference_time = end_time - start_time  # Calculate inference time in seconds
+# Convert seconds to more readable format
+if inference_time >= 3600:
+    hours = int(inference_time // 3600)
+    minutes = int((inference_time % 3600) // 60)
+    seconds = int(inference_time % 60)
+    inference_time_str = f"{hours} hours {minutes} minutes {seconds} seconds"
+elif inference_time >= 60:
+    minutes = int(inference_time // 60)
+    seconds = int(inference_time % 60)
+    inference_time_str = f"{minutes} minutes {seconds} seconds"
+else:
+    seconds = int(inference_time)
+    inference_time_str = f"{seconds} seconds"
+output = enc.decode(generated[0].tolist())
+print(f"Prompt: {args.prompt}")
+print(f"Generated text: {output}")
+print(f"Generated text length: {len(output)}")
+print(f"Inference time: {inference_time_str}")

modelGenerate.py ADDED Viewed

	@@ -0,0 +1,442 @@

+import math
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+class LayerNorm(nn.Module):
+    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
+    def __init__(self, ndim, bias):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(ndim))
+        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+    def forward(self, input):
+        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+class RWKV_TimeMix_x051a(nn.Module):
+    """Multi-head RWKV (x051a) time-mixing layer, evaluated chunk by chunk.
+
+    The constructor reads `n_embd`, `n_head`, `n_layer`, `bias` and `dropout`
+    from `config`; `layer_id` only influences the initial values of the
+    `time_*` parameters. `forward` takes a (B, T, C) tensor and returns a
+    tensor of the same shape.
+    """
+    def __init__(self, config, layer_id):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        self.head_size = config.n_embd // config.n_head
+        self.n_head = config.n_head
+        with torch.no_grad():
+            # NOTE: the divisions by (n_layer - 1) and (n_head - 1) below raise
+            # ZeroDivisionError for a single-layer or single-head configuration
+            ratio_0_to_1 = layer_id / (config.n_layer - 1)  # 0 to 1
+            ratio_1_to_almost0 = 1.0 - (layer_id / config.n_layer)  # 1 to ~0
+            # ddd[0, 0, i] = i / n_embd: a per-channel ramp in [0, 1)
+            ddd = torch.ones(1, 1, config.n_embd)
+            for i in range(config.n_embd):
+                ddd[0, 0, i] = i / config.n_embd
+            # per-channel interpolation weights between the current and the shifted input
+            self.time_maa_k = nn.Parameter(
+                1.0 - torch.pow(ddd, ratio_1_to_almost0))
+            self.time_maa_v = nn.Parameter(
+                1.0 - (torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1))
+            self.time_maa_r = nn.Parameter(
+                1.0 - torch.pow(ddd, 0.5 * ratio_1_to_almost0))
+            self.time_maa_g = nn.Parameter(
+                1.0 - torch.pow(ddd, 0.5 * ratio_1_to_almost0))
+            # one decay value per head, stored with shape (n_head, 1)
+            decay_speed = torch.ones(self.n_head)
+            for h in range(self.n_head):
+                decay_speed[h] = -6 + 5 * \
+                    (h / (self.n_head - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
+            self.time_decay = nn.Parameter(decay_speed.unsqueeze(-1))
+            # one "time_first" bonus per head, stored with shape (n_head, 1)
+            tmp = torch.zeros(self.n_head)
+            for h in range(self.n_head):
+                tmp[h] = ratio_0_to_1 * (1 - (h / (self.n_head - 1)))
+            self.time_faaaa = nn.Parameter(tmp.unsqueeze(-1))
+        # pads one zero row in front of the second-to-last axis and crops the
+        # last row, i.e. shifts the sequence by one step
+        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
+        self.receptance = nn.Linear(
+            config.n_embd, config.n_embd, bias=config.bias)
+        self.key = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        self.value = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        self.gate = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        self.output = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        self.ln_x = nn.GroupNorm(self.n_head, config.n_embd, eps=(1e-5)*64)
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x):
+        """Apply time mixing to `x` of shape (B, T, C) and return a (B, T, C) tensor."""
+        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
+        H, N = self.n_head, self.head_size
+        # chunk length Q: 256 or 128 when T is a multiple of it, otherwise the
+        # whole sequence is processed as a single chunk
+        if T % 256 == 0:
+            Q = 256
+        elif T % 128 == 0:
+            Q = 128
+        else:
+            Q = T
+        assert T % Q == 0
+        # difference between the shifted (previous-step) input and the current input
+        xx = self.time_shift(x) - x
+        xk = x + xx * self.time_maa_k
+        xv = x + xx * self.time_maa_v
+        xr = x + xx * self.time_maa_r
+        xg = x + xx * self.time_maa_g
+        # r, v: (B, H, T, N); k: (B, H, N, T)
+        r = self.receptance(xr).view(B, T, H, N).transpose(1, 2)  # receptance
+        k = self.key(xk).view(B, T, H, N).permute(0, 2, 3, 1)  # key
+        v = self.value(xv).view(B, T, H, N).transpose(1, 2)  # value
+        g = F.silu(self.gate(xg))  # extra gate
+        w = torch.exp(-torch.exp(self.time_decay.float()))  # time_decay
+        u = self.time_faaaa.float()  # time_first
+        # ws: per-head decay raised to the chunk length, applied to the carried state
+        ws = w.pow(Q).view(1, H, 1, 1)
+        # ind holds the exponents Q-1 ... 0 for every head
+        ind = torch.arange(
+            Q-1, -1, -1, device=r.device).unsqueeze(0).repeat(H, 1)
+        w = w.repeat(1, Q).pow(ind)
+        wk = w.view(1, H, 1, Q)
+        wb = wk.transpose(-2, -1).flip(2)
+        # the pad / tile / view sequence below turns the per-head row of decay
+        # powers (with u appended) into a (1, H, Q, Q) matrix
+        w = torch.cat([w[:, 1:], u], dim=1)
+        w = F.pad(w, (0, Q))
+        w = torch.tile(w, [Q])
+        w = w[:, :-Q].view(-1, Q, 2*Q - 1)
+        w = w[:, :, Q-1:].view(1, H, Q, Q)
+        w = w.to(dtype=r.dtype)  # the decay matrix
+        wk = wk.to(dtype=r.dtype)
+        wb = wb.to(dtype=r.dtype)
+        ws = ws.to(dtype=r.dtype)
+        state = torch.zeros(B, H, N, N, device=r.device,
+                            dtype=r.dtype)  # state
+        y = torch.empty(B, H, T, N, device=r.device, dtype=r.dtype)  # output
+        for i in range(T // Q):  # the rwkv-x051a operator
+            rr = r[:, :, i*Q:i*Q+Q, :]
+            kk = k[:, :, :, i*Q:i*Q+Q]
+            vv = v[:, :, i*Q:i*Q+Q, :]
+            # within-chunk term plus the contribution of the state carried
+            # over from earlier chunks
+            y[:, :, i*Q:i*Q+Q, :] = ((rr @ kk) * w) @ vv + (rr @ state) * wb
+            state = ws * state + (kk * wk) @ vv
+        # GroupNorm is applied on a (B*T, C) view, then the result is gated
+        y = y.transpose(1, 2).contiguous().view(B * T, C)
+        y = self.ln_x(y).view(B, T, C) * g
+        # output projection
+        y = self.dropout(self.output(y))
+        return y
+class RWKV_ChannelMix_x051a(nn.Module):
+    def __init__(self, config, layer_id):
+        super().__init__()
+        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
+        with torch.no_grad():
+            ratio_1_to_almost0 = 1.0 - (layer_id / config.n_layer)
+            ddd = torch.ones(1, 1, config.n_embd)
+            for i in range(config.n_embd):
+                ddd[0, 0, i] = i / config.n_embd
+            self.time_maa_k = nn.Parameter(
+                1.0 - torch.pow(ddd, ratio_1_to_almost0))
+            self.time_maa_r = nn.Parameter(
+                1.0 - torch.pow(ddd, ratio_1_to_almost0))
+        self.key = nn.Linear(config.n_embd, 3 *
+                             config.n_embd, bias=config.bias)
+        self.value = nn.Linear(
+            3 * config.n_embd, config.n_embd, bias=config.bias)
+        self.receptance = nn.Linear(
+            config.n_embd, config.n_embd, bias=config.bias)
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x):
+        xx = self.time_shift(x) - x
+        xk = x + xx * self.time_maa_k
+        xr = x + xx * self.time_maa_r
+        x = self.key(xk)
+        x = torch.relu(x) ** 2
+        x = self.value(x)
+        x = torch.sigmoid(self.receptance(xr)) * x
+        x = self.dropout(x)
+        return x
+class RMSNorm(nn.Module):
+    def __init__(self, dim, eps=1e-8):
+        super().__init__()
+        self.scale = dim ** -0.5
+        self.eps = eps
+    def forward(self, x):
+        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+        return x / (norm + self.eps)
+class GroupedQAttention(nn.Module):
+    """Attention-like layer built from one fused projection of width 4 * dim.
+
+    `forward` maps (batch, seq_len, dim) to (batch, seq_len, dim). No mask is
+    applied to the attention scores.
+    NOTE(review): the reshapes below look order-sensitive; trained checkpoints
+    presumably depend on them exactly as written — do not "tidy" without retraining.
+    """
+    def __init__(self, dim, num_heads, groups=4):
+        super().__init__()
+        self.num_heads = num_heads
+        self.groups = groups
+        self.qkvw = nn.Linear(dim, dim * 4, bias=False)
+        self.out = nn.Linear(dim, dim, bias=False)
+    def forward(self, x):
+        batch, seq_len, dim = x.shape
+        qkvw = self.qkvw(x)  # GENERATE
+        qkvw_gropus = torch.chunk(qkvw, self.groups, dim=-1)  # GENERATE
+        # unpacking into four names requires the chunking above to yield
+        # exactly four pieces, i.e. it only works for groups == 4
+        q, k, v, w = [t.chunk(self.groups, dim=-1) for t in qkvw_gropus]
+        # the i-th sub-chunks of all four pieces are stacked along the batch axis
+        q, k, v, w = [
+            torch.cat([qi, ki, vi, wi], dim=0)
+            for qi, ki, vi, wi in zip(q, k, v, w)
+        ]
+        # NOTE(review): `view` reinterprets memory without moving the sequence
+        # axis first, so after the transpose the matmul below appears to mix
+        # across the head axis rather than across time — confirm intent
+        q, k, v = map(
+            lambda t: t.view(batch * self.groups, self.num_heads, -1,
+                             dim // self.num_heads // self.groups).transpose(1, 2),
+            [q, k, v]
+        )
+        w = w.view(batch * self.groups, self.num_heads, -
+                   1, dim // self.num_heads // self.groups)
+        # scaled dot-product scores followed by a softmax over the last axis
+        attn_output = (q @ k.transpose(-2, -1)) * \
+            (dim // self.num_heads // self.groups) ** -0.5
+        attn_output = attn_output.softmax(dim=-1)
+        attn_output = (attn_output @ v).transpose(1,
+                                                  2).reshape(batch, seq_len, dim)
+        # element-wise gating by w before the output projection
+        return self.out(attn_output * w.reshape(batch, seq_len, dim))
+class SlidingWindowAttention(nn.Module):
+    """Attention layer that pads the sequence to a multiple of `window_size`.
+
+    `forward` maps (B, N, C) to (B, N, C). No mask is applied to the scores.
+    """
+    def __init__(self, dim, window_size, num_heads):
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=False)
+        self.proj = nn.Linear(dim, dim, bias=False)
+    def forward(self, x):
+        B, N, C = x.shape
+        # fused projection split into q, k, v, each (B, num_heads, N, head_dim)
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  self.head_dim).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        q = q * self.head_dim ** -0.5
+        # Pad to multiple of window size
+        padding = (self.window_size - N % self.window_size) % self.window_size
+        q = F.pad(q, (0, 0, 0, padding))
+        k = F.pad(k, (0, 0, 0, padding))
+        v = F.pad(v, (0, 0, 0, padding))
+        # Reshape to sliding windows
+        # NOTE(review): the middle axis becomes window_size and the remaining
+        # elements are folded into the last axis, so the softmax below runs
+        # over window_size entries per row — confirm this is the intended windowing
+        q = q.reshape(B * self.num_heads, self.window_size, -1)
+        k = k.reshape(B * self.num_heads, self.window_size, -1)
+        v = v.reshape(B * self.num_heads, self.window_size, -1)
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = attn @ v
+        # undo the folding, drop the padded positions and merge the heads again
+        attn = attn.reshape(B, self.num_heads, N + padding, self.head_dim)
+        attn = attn[:, :, :N, :].permute(0, 2, 1, 3).reshape(B, N, C)
+        return self.proj(attn)
+class TinyMoE(nn.Module):
+    """Small softly-gated mixture of linear experts.
+
+    `forward` takes (b, n, d) and returns `(output, aux_loss)`, where `output`
+    has shape (b, n, dim) and `aux_loss` is a scalar already multiplied by
+    `aux_loss_weight`.
+    """
+    def __init__(self, dim, num_experts, num_active_experts, expert_dim, dropout=0.0, expert_capacity_scale=1.0, aux_loss_weight=0.1):
+        super().__init__()
+        self.dim = dim
+        self.num_experts = num_experts
+        self.num_active_experts = num_active_experts
+        self.expert_dim = expert_dim
+        self.dropout = nn.Dropout(dropout)
+        # the gate scores all `num_experts`, but only `num_active_experts`
+        # expert layers are instantiated below
+        self.gate = nn.Linear(dim, num_experts)
+        self.expert_capacity_scale = expert_capacity_scale
+        self.scaled_expert_dim = int(expert_dim * self.expert_capacity_scale)
+        self.experts = nn.ModuleList(
+            [nn.Linear(dim, self.scaled_expert_dim) for _ in range(num_active_experts)])
+        self.fc = nn.Linear(self.scaled_expert_dim, dim)
+        # Auxiliary loss
+        self.aux_loss_weight = aux_loss_weight
+        self.expert_diversity_loss = nn.MSELoss()
+    def forward(self, x):
+        b, n, d = x.shape
+        # Compute attention scores
+        scores = self.gate(x).view(b, n, self.num_experts)
+        scores = F.softmax(scores, dim=-1)
+        # Apply dropout to the attention scores
+        scores = self.dropout(scores)
+        # Compute the weighted sum of expert outputs
+        # (every expert processes every token; there is no sparse routing)
+        expert_outputs = torch.stack(
+            [exp(x.view(b * n, d)) for exp in self.experts], dim=1)
+        expert_outputs = expert_outputs.view(
+            b, n, self.num_active_experts, self.scaled_expert_dim)
+        # only the first `num_active_experts` gate scores are used as weights
+        weighted_outputs = (
+            expert_outputs * scores[:, :, :self.num_active_experts].unsqueeze(-1)).sum(dim=2)
+        # Apply the final linear layer
+        output = self.fc(weighted_outputs)
+        # Auxiliary loss: Expert diversity
+        # (b, num_active_experts, scaled_expert_dim)
+        expert_activations = expert_outputs.mean(dim=1)
+        # MSE against an all-zero target, i.e. the mean squared activation
+        expert_diversity_loss = self.expert_diversity_loss(expert_activations.transpose(
+            0, 1), torch.zeros_like(expert_activations.transpose(0, 1)))
+        return output, expert_diversity_loss * self.aux_loss_weight
+    def set_expert_capacity(self, expert_capacity_scale):
+        """Change the expert width; this replaces `experts` and `fc` with newly created layers."""
+        self.expert_capacity_scale = expert_capacity_scale
+        self.scaled_expert_dim = int(
+            self.expert_dim * self.expert_capacity_scale)
+        self.experts = nn.ModuleList([nn.Linear(
+            self.dim, self.scaled_expert_dim) for _ in range(self.num_active_experts)])
+        self.fc = nn.Linear(self.scaled_expert_dim, self.dim)
+class Block(nn.Module):
+    """One model layer: RWKV time mixing, RWKV channel mixing, sliding-window
+    attention, grouped attention and a mixture-of-experts layer, each added
+    residually to the running activation."""
+    def __init__(self, config, layer_id):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.n_embd)
+        self.ln_2 = RMSNorm(config.n_embd)
+        # stay in here because this is a core component
+        self.tmix = RWKV_TimeMix_x051a(config, layer_id)
+        # Add GroupedQAttention instance
+        self.grouped_attn = GroupedQAttention(config.n_embd, config.n_head)
+        # stay in here because this is a core component
+        self.cmix = RWKV_ChannelMix_x051a(config, layer_id)
+        self.sliding_attn = SlidingWindowAttention(
+            config.n_embd, window_size=256, num_heads=config.n_head)
+        # the MoE width comes from config.dim, not config.n_embd
+        self.moe = TinyMoE(config.dim, config.num_experts, config.num_active_experts,
+                           config.expert_dim, config.dropout, expert_capacity_scale=1.2, aux_loss_weight=0.01)
+    def forward(self, x):
+        x = x + self.tmix(self.ln_1(x))
+        x = x + self.cmix(self.ln_2(x))
+        # the two attention branches below receive the un-normalised activation
+        x = x + self.sliding_attn(x)  # Apply sliding window attention
+        # `tmix` is applied a second time here, with the same weights as above
+        x = x + self.grouped_attn(self.tmix(x))  # Apply GroupedQAttention
+        # x = x + self.moe(x)  # Apply TinyMoE
+        # NOTE: aux_loss is computed but neither returned nor added to any loss
+        moe_output, aux_loss = self.moe(x)
+        x = x + moe_output
+        return x
+class GPT(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.vocab_size is not None
+        assert config.block_size is not None
+        self.config = config
+        self.transformer = nn.ModuleDict(dict(
+            wte=nn.Embedding(config.vocab_size, config.n_embd),
+            wpe=nn.Embedding(config.block_size, config.n_embd),
+            drop=nn.Dropout(config.dropout),
+            h=nn.ModuleList([Block(config, i) for i in range(config.n_layer)]),
+            ln_f=LayerNorm(config.n_embd, bias=config.bias),
+        ))
+        self.lm_head = nn.Linear(
+            self.config.n_embd, self.config.vocab_size, bias=False)
+        self.transformer.wte.weight = self.lm_head.weight
+        # init all weights
+        self.apply(self._init_weights)
+        # apply special scaled init to the residual projections, per GPT-2 paper
+        for pn, p in self.named_parameters():
+            if pn.endswith('tmix.output.weight'):
+                torch.nn.init.normal_(
+                    p, mean=0.0, std=0.02/math.sqrt(2 * self.config.n_layer))
+        # report number of parameters
+        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
+    def get_num_params(self, non_embedding=True):
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            n_params -= self.transformer.wpe.weight.numel()
+        return n_params
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+    def forward(self, idx, targets=None):
+        device = idx.device
+        b, t = idx.size()
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)
+        # forward the GPT model itself
+        # token embeddings of shape (b, t, n_embd)
+        tok_emb = self.transformer.wte(idx)
+        # position embeddings of shape (t, n_embd)
+        pos_emb = self.transformer.wpe(pos)
+        x = self.transformer.drop(tok_emb + pos_emb)
+        for block in self.transformer.h:
+            x = block(x)
+        x = self.transformer.ln_f(x)
+        if targets is not None:
+            # if we are given some desired targets also calculate the loss
+            logits = self.lm_head(x)
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+        else:
+            # inference-time mini-optimization: only forward the lm_head on the very last position
+            # note: using list [-1] to preserve the time dim
+            logits = self.lm_head(x[:, [-1], :])
+            loss = None
+        return logits, loss
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, top_k=None):
+        for _ in range(max_new_tokens):
+            # if the sequence context is growing too long we must crop it at block_size
+            idx_cond = idx if idx.size(
+                1) <= self.config.block_size else idx[:, -self.config.block_size:]
+            # forward the model to get the logits for the index in the sequence
+            logits, _ = self(idx_cond)
+            # pluck the logits at the final step and scale by desired temperature
+            logits = logits[:, -1, :]
+            # optionally crop the logits to only the top k options
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            # apply softmax to convert logits to (normalized) probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution
+            idx_next = torch.multinomial(probs, num_samples=1)
+            # append sampled index to the running sequence and continue
+            idx = torch.cat((idx, idx_next), dim=1)
+        return idx

modeling_rwkv.py ADDED Viewed

	@@ -0,0 +1,687 @@

+"""
+Full definition of a RWKV Language Model, all of it in this single file.
+References:
+1) the official RWKV PyTorch implementation released by Bo Peng:
+https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/src/model.py
+2) huggingface/transformers PyTorch implementation:
+https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py
+"""
+import math,time
+import os
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+# Slot indices along the third axis of the recurrent `state` tensor, which is
+# indexed as state[layer_id, :, SLOT, :] by TimeMixing and ChannelMixing.
+PREV_X_TIME = 0  # previous input of the time-mixing sub-layer
+NUM_STATE = 1  # running numerator of the wkv recurrence
+DEN_STATE = 2  # running denominator of the wkv recurrence
+MAX_STATE = 3  # running maximum used to keep the exponentials bounded
+PREV_X_CHANNEL = 4  # previous input of the channel-mixing sub-layer
+# copied from nanoGPT
+class LayerNorm(nn.Module):
+    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
+    def __init__(self, ndim, bias):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(ndim))
+        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+    def forward(self, input):
+        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+# learn from GPT-4
+from unittest.mock import patch
+class CudaNotAvailable:
+    def __enter__(self):
+        self.patcher = patch("torch.cuda.is_available", return_value=False)
+        self.patcher.start()
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.patcher.stop()
+# https://github.com/BlinkDL/RWKV-LM/blob/cca1b5e8e597cf40675882bb10b46287c844e35c/RWKV-v4/src/model.py#L21
+class L2Wrap(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, loss, y):
+        ctx.save_for_backward(y)
+        return loss
+    @staticmethod
+    def backward(ctx, grad_output):
+        y = ctx.saved_tensors[0]
+        # to encourage the logits to be close to 0
+        factor = 1e-4 / (y.shape[0] * y.shape[1])
+        maxx, ids = torch.max(y, -1, keepdim=True)
+        gy = torch.zeros_like(y)
+        gy.scatter_(-1, ids, maxx * factor)
+        return (grad_output, gy)
+class ChannelMixing(nn.Module):
+    def __init__(self,config,layer_id):
+        super().__init__()
+        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
+        self.layer_id = layer_id
+        n_embd = config.n_embd
+        intermediate_size = (
+            config.intermediate_size if config.intermediate_size is not None else 4 * n_embd
+        )
+        ## Learnable Matrix
+        self.key_proj        = nn.Linear(n_embd,intermediate_size,bias=False)
+        self.value_proj      = nn.Linear(intermediate_size,n_embd,bias=False)
+        self.receptance_proj = nn.Linear(n_embd,n_embd,bias=False)
+        ## Learnable Vector
+        self.time_mix_key        = nn.Parameter(torch.empty(1, 1, n_embd))
+        self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, n_embd))
+    def forward(self,x,state=None):
+        # x = (Batch,Time,Channel)
+        if state is not None:
+            prev_x = state[self.layer_id,:,[PREV_X_CHANNEL],:]
+            state[self.layer_id,:,[PREV_X_CHANNEL],:] = x
+        else:
+            prev_x = self.time_shift(x)
+        ## R
+        receptance = x * self.time_mix_receptance + prev_x * (1 - self.time_mix_receptance)
+        receptance = self.receptance_proj(receptance)
+        receptance = F.sigmoid(receptance)
+        # K
+        key = x * self.time_mix_key + prev_x * (1 - self.time_mix_key)
+        key = self.key_proj(key)
+        # V
+        value = self.value_proj(torch.square(torch.relu(key)))
+        ## output
+        out = receptance * value
+        return out, state
+class TimeMixing(nn.Module):
+    """RWKV time-mixing sub-layer.
+
+    forward(x, state=None) returns (rwkv, state). The wkv term is computed
+    either by the custom CUDA kernel (no state and
+    config.use_customized_cuda_kernel set) or by a plain PyTorch loop over
+    the time steps.
+    """
+    def __init__(self,config,layer_id):
+        super().__init__()
+        self.config = config
+        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
+        self.layer_id = layer_id
+        n_embd = config.n_embd
+        attn_sz = n_embd
+        ## learnable matrix
+        self.key_proj        = nn.Linear(n_embd, attn_sz, bias=False)
+        self.value_proj      = nn.Linear(n_embd, attn_sz, bias=False)
+        self.receptance_proj = nn.Linear(n_embd, attn_sz, bias=False)
+        self.output_proj     = nn.Linear(attn_sz, n_embd, bias=False)
+        ## learnable vector
+        ## (created uninitialised with torch.empty; values must be assigned elsewhere before use)
+        self.time_decay          = nn.Parameter(torch.empty(attn_sz))
+        self.time_first          = nn.Parameter(torch.empty(attn_sz))
+        self.time_mix_key        = nn.Parameter(torch.empty(1, 1, n_embd))
+        self.time_mix_value      = nn.Parameter(torch.empty(1, 1, n_embd))
+        self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, n_embd))
+    def forward(self,x,state=None):
+        """Mix `x` over time; returns `(rwkv, state)` where `state` is None when no state was given."""
+        # x = (Batch,Time,Channel)
+        if state is not None:
+            # read the stored previous input first, then overwrite it with the current one
+            prev_x = state[self.layer_id,:,[PREV_X_TIME],:]
+            state[self.layer_id,:,[PREV_X_TIME],:] = x
+        else:
+            prev_x = self.time_shift(x)
+        # K
+        key = x * self.time_mix_key + prev_x * (1 - self.time_mix_key)
+        key = self.key_proj(key)
+        # V
+        value = x * self.time_mix_value + prev_x * (1 - self.time_mix_value)
+        value = self.value_proj(value)
+        # R
+        receptance = x * self.time_mix_receptance + prev_x * (1 - self.time_mix_receptance)
+        receptance = self.receptance_proj(receptance)
+        receptance = F.sigmoid(receptance)
+        # WKV
+        wkv, state  = self.wkv_function(key,value,use_customized_cuda_kernel=self.config.use_customized_cuda_kernel,state=state)
+        # RWKV
+        rwkv = receptance * wkv
+        rwkv = self.output_proj(rwkv)
+        return rwkv, state
+    def wkv_function(self,key,value,use_customized_cuda_kernel,state=None):
+        """Compute the wkv term for `key`/`value`; returns `(output, state)`.
+
+        `state` is returned as None on the kernel path and on the stateless
+        PyTorch path; otherwise the given tensor is updated in place and returned.
+        """
+        ## essentially, this customized cuda kernel delivers a faster for loop across time steps
+        ## only for training and evaluating loss and ppl
+        if state is None and use_customized_cuda_kernel:
+            B, T, C = key.size()
+            return WKVKernel.apply(B, T, C, self.time_decay, self.time_first, key, value), None
+        ## raw wkv function (from Huggingface Implementation)
+        ## only for generation (because using raw pytorch for loop to train the model would be super super slow)
+        else:
+            _, seq_length, _ = key.size()
+            output = torch.zeros_like(key)
+            debug_mode = False
+            if state is None:
+                ## only for debug purpose when use_customized_cuda_kernel=False and state is None
+                debug_mode = True
+                num_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
+                den_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
+                # a very negative starting maximum
+                max_state = torch.zeros_like(key[:, 0], dtype=torch.float32) - 1e38
+            else:
+                num_state  = state[self.layer_id,:,NUM_STATE,:]
+                den_state  = state[self.layer_id,:,DEN_STATE,:]
+                max_state  = state[self.layer_id,:,MAX_STATE,:]
+            time_decay = -torch.exp(self.time_decay)
+            for current_index in range(seq_length):
+                current_key = key[:, current_index].float()
+                current_value = value[:, current_index]
+                # wkv computation at time t
+                # (exponents are taken relative to the running maximum, so no exp() argument is positive)
+                max_for_output = torch.maximum(max_state, current_key + self.time_first)
+                e1 = torch.exp(max_state - max_for_output)
+                e2 = torch.exp(current_key + self.time_first - max_for_output)
+                numerator = e1 * num_state + e2 * current_value
+                denominator = e1 * den_state + e2
+                output[:, current_index] = (numerator / denominator).to(output.dtype)
+                # Update state for next iteration
+                # (the carried state is decayed by time_decay; the current key enters without time_first)
+                max_for_state = torch.maximum(max_state + time_decay, current_key)
+                e1 = torch.exp(max_state + time_decay - max_for_state)
+                e2 = torch.exp(current_key - max_for_state)
+                num_state = e1 * num_state + e2 * current_value
+                den_state = e1 * den_state + e2
+                max_state = max_for_state
+            if debug_mode:
+                return output, None
+            else:
+                # write the updated recurrence back into this layer's slots
+                state[self.layer_id,:,NUM_STATE,:] = num_state
+                state[self.layer_id,:,DEN_STATE,:] = den_state
+                state[self.layer_id,:,MAX_STATE,:] = max_state
+                return output, state
+class Block(nn.Module):
+    def __init__(self, config,layer_id):
+        super().__init__()
+        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.attn = TimeMixing(config,layer_id)
+        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.ffn = ChannelMixing(config,layer_id)
+    def forward(self, x, state = None):
+        # state: [batch_size, 5 , n_embd]
+        # time mixing
+        residual = x
+        x,state = self.attn(self.ln_1(x),state=state)
+        x = x + residual
+        # channel mixing
+        residual = x
+        x, state = self.ffn(self.ln_2(x),state=state)
+        x = x + residual
+        return x, state
+@dataclass
+class RWKVConfig:
+    """Hyper-parameters of the RWKV model defined in this file."""
+    block_size: int = 1024 # same as nanoGPT
+    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    n_layer: int = 12
+    n_embd: int = 768
+    bias: bool = True # bias in LayerNorms, in RWKV, all bias in Linear is False
+    intermediate_size: int = None # intermediate_size in channel-mixing; None means 4 * n_embd
+    use_customized_cuda_kernel: bool = True # use the custom CUDA wkv kernel when no state is passed
+    dtype: str = "float16" ## bfloat16 is not supported in V100
+    rescale_every: int = 6 ## mysterious trick, only applies at inference time
+class RWKV(nn.Module):
+    def __init__(self, config,lr_init=0.0008):
+        """Build the embedding, the pre-LayerNorm, `n_layer` blocks, the final
+        LayerNorm and the output head, then initialise all weights.
+
+        `lr_init` sets the range of the uniform embedding initialisation.
+        """
+        super().__init__()
+        assert config.vocab_size is not None
+        assert config.block_size is not None
+        self.config = config
+        self.lr_init = lr_init ## used to initialize embedding parameters
+        self.rwkv = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            ln_p = LayerNorm(config.n_embd, bias=config.bias),
+            h = nn.ModuleList([Block(config,layer_id) for layer_id in range(config.n_layer)]),
+            ln_f = LayerNorm(config.n_embd, bias=config.bias),
+        ))
+        # unlike nanoGPT, the head weight is a separate tensor (no tying with wte here)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.apply(self._init_weights)
+        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
+        if self.config.use_customized_cuda_kernel:
+            ## load customized cuda kernel
+            self.load_cuda_kernel(config.dtype)
+    def get_num_params(self, non_embedding=True):
+        """
+        Return the number of parameters in the model.
+        For non-embedding count (default), the token embeddings get subtracted.
+        """
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            n_params -= self.rwkv.wte.weight.numel()
+        return n_params
+    def _init_weights(self, module):
+        ## initialize Vector Parameters in TimeMixing
+        if isinstance(module,TimeMixing):
+            layer_id = module.layer_id
+            n_layer = self.config.n_layer
+            n_embd = self.config.n_embd
+            attn_sz = n_embd
+            with torch.no_grad():
+                ratio_0_to_1 = layer_id / (n_layer - 1)  # 0 to 1
+                ratio_1_to_almost0 = 1.0 - (layer_id / n_layer)  # 1 to ~0
+                ddd = torch.ones(1, 1, n_embd)
+                for i in range(n_embd):
+                    ddd[0, 0, i] = i / n_embd
+                decay_speed = torch.ones(attn_sz)
+                for h in range(attn_sz):
+                    decay_speed[h] = -5 + 8 * (h / (attn_sz - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
+                module.time_decay = nn.Parameter(decay_speed)
+                zigzag = torch.tensor([(i + 1) % 3 - 1 for i in range(attn_sz)]) * 0.5
+                module.time_first = nn.Parameter(torch.ones(attn_sz) * math.log(0.3) + zigzag)
+                module.time_mix_key = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))
+                module.time_mix_value = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)
+                module.time_mix_receptance = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))
+        ## initialize Vector Parameters in ChannelMixing
+        elif isinstance(module,ChannelMixing):
+            layer_id = module.layer_id
+            n_layer = self.config.n_layer
+            n_embd = self.config.n_embd
+            with torch.no_grad():  # fancy init of time_mix
+                ratio_1_to_almost0 = 1.0 - (layer_id / n_layer)  # 1 to ~0
+                ddd = torch.ones(1, 1, n_embd)
+                for i in range(n_embd):
+                    ddd[0, 0, i] = i / n_embd
+                module.time_mix_key = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))
+                module.time_mix_receptance = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))
+        ## initialize Linear Layer and Embedding Layer
+        elif isinstance(module,(nn.Embedding,nn.Linear)):
+            weight = module.weight
+            shape = weight.shape
+            gain = 1.0
+            scale = 1.0
+            ## get the current name of the parameters
+            for _name,_parameters in self.named_parameters():
+                if id(_parameters) == id(weight):
+                    current_module_name = _name
+            # print(current_module_name)
+            ## Embedding
+            if isinstance(module, nn.Embedding):
+                gain = math.sqrt(max(shape[0], shape[1]))
+                scale = -1 * self.lr_init
+            ## Linear
+            elif isinstance(module,nn.Linear):
+                if shape[0] > shape[1]:
+                    gain = math.sqrt(shape[0] / shape[1])
+                ## initialize some matrix to be all ZEROS
+                for name in [".attn.key_proj.", ".attn.receptance_proj.", ".attn.output_proj.",
+                             ".ffn.value_proj.", ".ffn.receptance_proj."]:
+                    if name in current_module_name:
+                        scale = 0
+                if current_module_name == 'lm_head.weight':
+                    scale = 0.5
+            if scale == 0:
+                nn.init.zeros_(weight)
+            elif scale < 0:
+                nn.init.uniform_(weight, a=scale, b=-scale)
+            else:
+                nn.init.orthogonal_(weight, gain=gain * scale)
+    def forward(self, idx, targets=None, state=None, return_state=False):
+        device = idx.device
+        b, t = idx.size()
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        x = self.rwkv.wte(idx)
+        x = self.rwkv.ln_p(x)
+        # x = self.rwkv.drop(x)
+        for block_idx,block in enumerate(self.rwkv.h):
+            x, state = block(x,state)
+            if state is not None: ## in generation mode
+                if (
+                    self.config.rescale_every > 0
+                    and (block_idx + 1) % self.config.rescale_every == 0
+                ):
+                    x = x/2
+        x = self.rwkv.ln_f(x)
+        if targets is not None:
+            # if we are given some desired targets also calculate the loss
+            logits = self.lm_head(x)
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+            if self.training:
+                loss = L2Wrap.apply(loss,logits) # from RWKV-LM
+        else:
+            # inference-time mini-optimization: only forward the lm_head on the very last position
+            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
+            loss = None
+        if return_state:
+            return logits, loss, state
+        else:
+            return logits, loss
+    def crop_block_size(self, block_size):
+        assert block_size <= self.config.block_size
+        self.config.block_size = block_size
+    @classmethod
+    def from_pretrained(cls, model_type,use_customized_cuda_kernel=True,dtype="float16"):
+        """
+        Build an RWKV model and fill it with the weights of a Hugging Face RWKV checkpoint.
+
+        model_type: Hugging Face model id; must be one of the ids asserted below.
+        use_customized_cuda_kernel, dtype: passed unchanged into RWKVConfig.
+        Returns the RWKV model after every mapped tensor has been copied over.
+        """
+        assert model_type in {
+            'RWKV/rwkv-4-169m-pile',
+            "RWKV/rwkv-4-430m-pile",
+            "RWKV/rwkv-4-1b5-pile",
+            "RWKV/rwkv-4-3b-pile",
+            "RWKV/rwkv-4-7b-pile",
+            "RWKV/rwkv-raven-7b",
+            "RWKV/rwkv-raven-1b5",
+            "RWKV/rwkv-raven-3b",
+            "RWKV/rwkv-4-14b-pile",
+            }
+        print("loading weights from pretrained RWKV: %s" % model_type)
+        # init a huggingface/transformers model
+        from transformers import RwkvForCausalLM,RwkvConfig
+        hf_config = RwkvConfig.from_pretrained(model_type)
+        with CudaNotAvailable(): ## avoid HF load kernel
+            hf_model = RwkvForCausalLM.from_pretrained(model_type)
+        # create a from-scratch initialized RWKV model
+        # NOTE(review): vocab_size is hard-coded here rather than read from hf_config
+        config = {
+            "vocab_size":50277,
+            "n_layer":hf_config.num_hidden_layers,
+            "n_embd":hf_config.hidden_size,
+            "intermediate_size":hf_config.intermediate_size,
+            "use_customized_cuda_kernel":use_customized_cuda_kernel,
+            "dtype": dtype,
+        }
+        config = RWKVConfig(**config)
+        # NOTE(review): instantiates RWKV directly, not `cls`
+        model = RWKV(config)
+        num_layers = config.n_layer
+        ## create mapping from the parameter name in RWKV to that of HF-RWKV
+        mapping = {
+            "rwkv.wte.weight":"rwkv.embeddings.weight",
+            "rwkv.ln_p.weight":"rwkv.blocks.0.pre_ln.weight",
+            "rwkv.ln_p.bias":"rwkv.blocks.0.pre_ln.bias",
+            "rwkv.ln_f.weight":"rwkv.ln_out.weight",
+            "rwkv.ln_f.bias":"rwkv.ln_out.bias",
+            "lm_head.weight":"head.weight",
+            **{f"rwkv.h.{layer_id}.ln_{norm_id}.weight":f"rwkv.blocks.{layer_id}.ln{norm_id}.weight" for layer_id in range(num_layers) for norm_id in [1,2]},
+            **{f"rwkv.h.{layer_id}.ln_{norm_id}.bias":f"rwkv.blocks.{layer_id}.ln{norm_id}.bias" for layer_id in range(num_layers) for norm_id in [1,2]},
+            **{f"rwkv.h.{layer_id}.attn.{_type}":f"rwkv.blocks.{layer_id}.attention.{_type}" for layer_id in range(num_layers) for _type in ["time_decay","time_first",'time_mix_key','time_mix_value',"time_mix_receptance"]},
+            **{f"rwkv.h.{layer_id}.attn.{_type}_proj.weight":f"rwkv.blocks.{layer_id}.attention.{_type}.weight" for layer_id in range(num_layers) for _type in ["key","value",'receptance',"output"]},
+            **{f"rwkv.h.{layer_id}.ffn.{_type}":f"rwkv.blocks.{layer_id}.feed_forward.{_type}" for layer_id in range(num_layers) for _type in ['time_mix_key',"time_mix_receptance"]},
+            **{f"rwkv.h.{layer_id}.ffn.{_type}_proj.weight":f"rwkv.blocks.{layer_id}.feed_forward.{_type}.weight" for layer_id in range(num_layers) for _type in ["key","value",'receptance']},
+        }
+        # every tensor of this model must map to an HF tensor, and together they
+        # must cover exactly the HF state dict (a missing key raises KeyError here)
+        mapped_set = [mapping[x] for x in model.state_dict().keys()]
+        assert set(mapped_set) == set(hf_model.state_dict().keys())
+        sd = model.state_dict()
+        hf_sd = hf_model.state_dict()
+        # copy in place, checking shapes pair by pair; the failing name pair is the assert message
+        for k1,k2 in mapping.items():
+            assert sd[k1].shape == hf_sd[k2].shape,(k1,k2)
+            sd[k1].copy_(hf_sd[k2])
+        return model
+    # def configure_optimizers(self,weight_decay,learning_rate,betas,device_type):
+    #     # lr_1x = set()
+    #     # lr_2x = set()
+    #     # lr_3x = set()
+    #     # for n, p in self.named_parameters():
+    #     #     if "time_mix" in n:lr_1x.add(n)
+    #     #     elif "time_decay" in n:lr_2x.add(n)
+    #     #     elif "time_first" in n:lr_3x.add(n)
+    #     #     else:lr_1x.add(n)
+    #     # lr_1x = sorted(list(lr_1x))
+    #     # lr_2x = sorted(list(lr_2x))
+    #     # lr_3x = sorted(list(lr_3x))
+    #     # param_dict = {n: p for n, p in self.named_parameters()}
+    #     # optim_groups = [
+    #     #     {"params": [param_dict[n] for n in lr_1x], "weight_decay": 0.0, "my_lr_scale": 1.0},
+    #     #     {"params": [param_dict[n] for n in lr_2x], "weight_decay": 0.0, "my_lr_scale": 2.0},
+    #     #     {"params": [param_dict[n] for n in lr_3x], "weight_decay": 0.0, "my_lr_scale": 3.0},
+    #     # ]
+    #     optim_groups = [{"params": [p for n, p in self.named_parameters()], "weight_decay": 0.0},]
+    #     fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+    #     use_fused = fused_available and device_type == 'cuda'
+    #     extra_args = dict(fused=True) if use_fused else dict()
+    #     optimizer = torch.optim.Adam(optim_groups, lr=learning_rate, betas=betas, eps=1e-8, weight_decay=weight_decay,amsgrad=False,**extra_args)
+    #     return optimizer
+    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
+        # start with all of the candidate parameters
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        # filter out those that do not require grad
+        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
+        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
+        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
+        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
+        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
+        optim_groups = [
+            {'params': decay_params, 'weight_decay': weight_decay},
+            {'params': nodecay_params, 'weight_decay': 0.0}
+        ]
+        num_decay_params = sum(p.numel() for p in decay_params)
+        num_nodecay_params = sum(p.numel() for p in nodecay_params)
+        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+        # Create AdamW optimizer and use the fused version if it is available
+        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+        use_fused = fused_available and device_type == 'cuda'
+        extra_args = dict(fused=True) if use_fused else dict()
+        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
+        print(f"using fused AdamW: {use_fused}")
+        return optimizer
+    def estimate_mfu(self, fwdbwd_per_iter, dt):
+        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
+        # first estimate the number of flops we do per iteration.
+        # see RWKV paper Appendix C as ref: https://arxiv.org/abs/2305.13048
+        cfg = self.config
+        L, V, D = cfg.n_layer, cfg.vocab_size, cfg.n_embd
+        # Note there is a typo in the RWKV paper. Forward pass is 2*fn, forward
+        # and backward is 6*fn.
+        flops_per_token = 2*(V*D + 13*(V**2)*L)
+        flops_per_fwdbwd = 3*flops_per_token
+        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
+        # express our flops throughput as ratio of A100 bfloat16 peak flops
+        flops_achieved = flops_per_iter * (1.0/dt) # per second
+        # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet.pdf
+        if cfg.dtype == 'bfloat16':
+            flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
+        elif cfg.dtype == 'float16':
+            flops_promised = 312e12 # A100 GPU float16 peak flops is 312 TFLOPS
+        else: #dtype == float32
+            flops_promised = 19.5e12 # A100 GPU float32 peak flops is 19.5 TFLOPS
+        mfu = flops_achieved / flops_promised
+        return mfu
+    def init_state(self,batch_size,device):
+        n_state = len([PREV_X_TIME,NUM_STATE,DEN_STATE,MAX_STATE,PREV_X_CHANNEL])
+        state = torch.zeros(
+            (self.config.n_layer,batch_size,n_state,self.config.n_embd),
+            dtype=torch.float32, device=device,
+        )
+        state[:,:,MAX_STATE,:] -= 1e30
+        return state
+    def scale_parameters(self):
+        if self.config.rescale_every > 0:
+            with torch.no_grad():
+                for block_id,block in enumerate(self.rwkv.h):
+                    block.attn.output_proj.weight.div_(2 ** int(block_id // self.config.rescale_every))
+                    block.ffn.value_proj.weight.div_(2 ** int(block_id // self.config.rescale_every))
+            self.scaled = True
+    def unscale_parameters(self):
+        if self.config.rescale_every > 0 and self.scaled:
+            with torch.no_grad():
+                for block_id,block in enumerate(self.rwkv.h):
+                    block.attn.output_proj.weight.mul_(2 ** int(block_id // self.config.rescale_every))
+                    block.ffn.value_proj.weight.mul_(2 ** int(block_id // self.config.rescale_every))
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+        """
+        idx: (batch_size,seq_len)
+        """
+        batch_size,seq_len = idx.shape
+        state = self.init_state(batch_size,idx.device)
+        for seq_id in range(seq_len):
+            logits, _, state = self(idx[:,[seq_id]], state = state, return_state=True)
+        for _ in range(max_new_tokens):
+            # pluck the logits at the final step and scale by desired temperature
+            logits = logits[:, -1, :] / temperature
+            # optionally crop the logits to only the top k options
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            # apply softmax to convert logits to (normalized) probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution
+            idx_next = torch.multinomial(probs, num_samples=1)
+            # append sampled index to the running sequence and continue
+            idx = torch.cat((idx, idx_next), dim=1)
+            logits, _, state = self(idx_next, state=state, return_state=True)
+        return idx
+    def load_cuda_kernel(self,dtype):
+        """
+        JIT-compile the WKV CUDA extension and publish a matching autograd Function
+        as the module-level global `WKVKernel`.
+
+        dtype: float-mode string. "bfloat16" selects the bf16 sources; any other
+               value selects the generic sources under cuda/.
+        The kernel is compiled with -DTmax=config.block_size, and both forward and
+        backward assert T <= T_MAX.
+        """
+        from torch.utils.cpp_extension import load
+        T_MAX = self.config.block_size
+        RWKV_FLOAT_MODE = dtype
+        if RWKV_FLOAT_MODE == "bfloat16":
+            wkv_cuda = load(name=f"wkv_{T_MAX}_bf16", sources=["cuda/wkv_op_bf16.cpp", "cuda/wkv_cuda_bf16.cu"], verbose=True, extra_cuda_cflags=["-t 4", "-std=c++17", "-res-usage", "--maxrregcount 60", "--use_fast_math", "-O3", "-Xptxas -O3", "--extra-device-vectorization", f"-DTmax={T_MAX}"])
+            class WKV(torch.autograd.Function):
+                @staticmethod
+                def forward(ctx, B, T, C, w, u, k, v):
+                    ctx.B = B
+                    ctx.T = T
+                    ctx.C = C
+                    assert T <= T_MAX
+                    assert B * C % min(C, 32) == 0
+                    # the kernel receives -exp(w) computed in float32; u is cast to bfloat16
+                    w = -torch.exp(w.float().contiguous())
+                    u = u.contiguous().bfloat16()
+                    k = k.contiguous()
+                    v = v.contiguous()
+                    y = torch.empty((B, T, C), device=w.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)
+                    wkv_cuda.forward(B, T, C, w, u, k, v, y)
+                    ctx.save_for_backward(w, u, k, v, y)
+                    return y
+                @staticmethod
+                def backward(ctx, gy):
+                    B = ctx.B
+                    T = ctx.T
+                    C = ctx.C
+                    assert T <= T_MAX
+                    assert B * C % min(C, 32) == 0
+                    w, u, k, v, y = ctx.saved_tensors
+                    gw = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)
+                    gu = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)
+                    gk = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)
+                    gv = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)
+                    wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.contiguous(), gw, gu, gk, gv)
+                    # gw/gu are filled per batch row as (B, C); reduce them over the batch dimension
+                    gw = torch.sum(gw, dim=0)
+                    gu = torch.sum(gu, dim=0)
+                    # the three leading None entries line up with the B, T, C inputs of forward
+                    return (None, None, None, gw, gu, gk, gv)
+        else:
+            wkv_cuda = load(name=f"wkv_{T_MAX}", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"], verbose=True, extra_cuda_cflags=["-res-usage", "--maxrregcount 60", "--use_fast_math", "-O3", "-Xptxas -O3", "--extra-device-vectorization", f"-DTmax={T_MAX}"])
+            class WKV(torch.autograd.Function):
+                @staticmethod
+                def forward(ctx, B, T, C, w, u, k, v):
+                    ctx.B = B
+                    ctx.T = T
+                    ctx.C = C
+                    assert T <= T_MAX
+                    assert B * C % min(C, 32) == 0
+                    if "32" in RWKV_FLOAT_MODE:
+                        w = -torch.exp(w.contiguous())
+                        u = u.contiguous()
+                        k = k.contiguous()
+                        v = v.contiguous()
+                    else:
+                        # modes without "32" in the name: inputs are upcast with .float() before the kernel call
+                        w = -torch.exp(w.float().contiguous())
+                        u = u.float().contiguous()
+                        k = k.float().contiguous()
+                        v = v.float().contiguous()
+                    y = torch.empty((B, T, C), device=w.device, memory_format=torch.contiguous_format)
+                    wkv_cuda.forward(B, T, C, w, u, k, v, y)
+                    ctx.save_for_backward(w, u, k, v, y)
+                    # NOTE(review): falls through and returns None when the mode is neither a "32" mode nor "float16"
+                    if "32" in RWKV_FLOAT_MODE:
+                        return y
+                    elif RWKV_FLOAT_MODE == "float16":
+                        return y.half()
+                @staticmethod
+                def backward(ctx, gy):
+                    B = ctx.B
+                    T = ctx.T
+                    C = ctx.C
+                    assert T <= T_MAX
+                    assert B * C % min(C, 32) == 0
+                    w, u, k, v, y = ctx.saved_tensors
+                    gw = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format)
+                    gu = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format)
+                    gk = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format)
+                    gv = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format)
+                    if "32" in RWKV_FLOAT_MODE:
+                        wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.contiguous(), gw, gu, gk, gv)
+                    else:
+                        wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.float().contiguous(), gw, gu, gk, gv)
+                    # gw/gu are filled per batch row as (B, C); reduce them over the batch dimension
+                    gw = torch.sum(gw, dim=0)
+                    gu = torch.sum(gu, dim=0)
+                    # NOTE(review): same implicit None fall-through as in forward for other modes
+                    if "32" in RWKV_FLOAT_MODE:
+                        return (None, None, None, gw, gu, gk, gv)
+                    elif RWKV_FLOAT_MODE == "float16":
+                        return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
+        # publish the Function class chosen above under a module-level name
+        global WKVKernel
+        WKVKernel = WKV

out/.keep ADDED Viewed

File without changes

sample.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""
+Sample from a trained model
+"""
+import os
+import pickle
+from contextlib import nullcontext
+import torch
+import tiktoken
+from modeling_gpt import GPTConfig, GPT
+from modeling_rwkv import RWKV,RWKVConfig
+# -----------------------------------------------------------------------------
+init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
+out_dir = 'out' # ignored if init_from is not 'resume'
+start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
+num_samples = 10 # number of samples to draw
+max_new_tokens = 500 # number of tokens generated in each sample
+temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
+top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
+seed = 1337
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
+compile = False # use PyTorch 2.0 to compile the model to be faster
+exec(open('configurator.py').read()) # overrides from command line or config file
+# -----------------------------------------------------------------------------
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
+torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+# model
+if init_from == 'resume':
+    # init from a model saved in a specific directory
+    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    gptconf = GPTConfig(**checkpoint['model_args'])
+    model = GPT(gptconf)
+    state_dict = checkpoint['model']
+    unwanted_prefix = '_orig_mod.'
+    for k,v in list(state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+    model.load_state_dict(state_dict)
+elif init_from.startswith('gpt2'):
+    # init from a given GPT-2 model
+    model = GPT.from_pretrained(init_from, dict(dropout=0.0))
+elif init_from.startswith("RWKV"):
+    model = RWKV.from_pretrained(init_from,use_customized_cuda_kernel=False,dtype=dtype)
+    model.scale_parameters()
+model.eval()
+model.to(device)
+if compile:
+    model = torch.compile(model) # requires PyTorch 2.0 (optional)
+# look for the meta pickle in case it is available in the dataset folder
+load_meta = False
+if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
+    meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
+    load_meta = os.path.exists(meta_path)
+if load_meta:
+    print(f"Loading meta from {meta_path}...")
+    with open(meta_path, 'rb') as f:
+        meta = pickle.load(f)
+    # TODO want to make this more general to arbitrary encoder/decoder schemes
+    stoi, itos = meta['stoi'], meta['itos']
+    encode = lambda s: [stoi[c] for c in s]
+    decode = lambda l: ''.join([itos[i] for i in l])
+elif init_from.startswith("gpt2"):
+    # ok let's assume gpt-2 encodings by default
+    print("No meta.pkl found, assuming GPT-2 encodings...")
+    enc = tiktoken.get_encoding("gpt2")
+    encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
+    decode = lambda l: enc.decode(l)
+elif init_from.startswith("RWKV"):
+    print("No meta.pkl found, assuming RWKV encodings...")
+    from transformers import AutoTokenizer
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    toker = AutoTokenizer.from_pretrained(init_from)
+    encode = lambda s:toker.encode(s)
+    decode = lambda s:toker.decode(s)
+# encode the beginning of the prompt
+if start.startswith('FILE:'):
+    with open(start[5:], 'r', encoding='utf-8') as f:
+        start = f.read()
+start_ids = encode(start)
+x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+# x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...].repeat(12,1)
+# run generation
+with torch.no_grad():
+    with ctx:
+        for k in range(num_samples):
+            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
+            print(decode(y[0].tolist()))
+            print('---------------')

scaling_laws.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

train.py ADDED Viewed

	@@ -0,0 +1,363 @@

+"""
+This training script can be run both on a single gpu in debug mode,
+and also in a larger training run with distributed data parallel (ddp).
+To run on a single GPU, example:
+$ python train.py --batch_size=32 --compile=False
+To run with DDP on 4 gpus on 1 node, example:
+$ torchrun --standalone --nproc_per_node=4 train.py
+To run with DDP on 8 gpus per node across 2 nodes, example:
+- Run on the first (master) node with example IP 123.456.123.456:
+$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
+- Run on the worker node:
+$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
+(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
+"""
+import os
+import time
+import math,json
+import pickle
+from contextlib import nullcontext
+import tiktoken
+import numpy as np
+import torch
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed import init_process_group, destroy_process_group
+from modeling_gpt import GPTConfig, GPT
+from modeling_rwkv import RWKVConfig,RWKV
+from transformers import AutoTokenizer
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# -----------------------------------------------------------------------------
+# default config values designed to train a gpt2 (124M) on OpenWebText
+# I/O
+out_dir = 'out'
+eval_interval = 2000
+log_interval = 1
+eval_iters = 200
+eval_only = False # if True, script exits right after the first eval
+always_save_checkpoint = True # if True, always save a checkpoint after each eval
+init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
+# wandb logging
+wandb_log = False # disabled by default
+wandb_project = 'owt'
+wandb_run_name = 'gpt2' # 'run' + str(time.time())
+# data
+dataset = 'openwebtext'
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
+batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
+block_size = 1024
+# model
+n_layer = 12
+n_head = 12
+n_embd = 768
+dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+bias = False # do we use bias inside LayerNorm and Linear layers?
+# adamw optimizer
+learning_rate = 6e-4 # max learning rate
+max_iters = 600000 # total number of training iterations
+weight_decay = 1e-1
+beta1 = 0.9
+beta2 = 0.95
+grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+# learning rate decay settings
+decay_lr = True # whether to decay the learning rate
+warmup_iters = 2000 # how many steps to warm up for
+lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
+min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+# DDP settings
+backend = 'nccl' # 'nccl', 'gloo', etc.
+# system
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
+compile = True # use PyTorch 2.0 to compile the model to be faster
+# model
+model_type = 'gpt'
+use_customized_cuda_kernel = True
+# -----------------------------------------------------------------------------
+config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
+exec(open('configurator.py').read()) # overrides from command line or config file
+config = {k: globals()[k] for k in config_keys} # will be useful for logging
+print(json.dumps(config,indent=4))
+# -----------------------------------------------------------------------------
+# various inits, derived attributes, I/O setup
+ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
+if ddp:
+    init_process_group(backend=backend)
+    ddp_rank = int(os.environ['RANK'])
+    ddp_local_rank = int(os.environ['LOCAL_RANK'])
+    ddp_world_size = int(os.environ['WORLD_SIZE'])
+    device = f'cuda:{ddp_local_rank}'
+    torch.cuda.set_device(device)
+    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
+    seed_offset = ddp_rank # each process gets a different seed
+    # world_size number of processes will be training simultaneously, so we can scale
+    # down the desired gradient accumulation iterations per process proportionally
+    assert gradient_accumulation_steps % ddp_world_size == 0
+    gradient_accumulation_steps //= ddp_world_size
+else:
+    # if not ddp, we are running on a single gpu, and one process
+    master_process = True
+    seed_offset = 0
+    ddp_world_size = 1
+tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
+print(f"tokens per iteration will be: {tokens_per_iter:,}")
+if master_process:
+    os.makedirs(out_dir, exist_ok=True)
+torch.manual_seed(1337 + seed_offset)
+torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+# note: float16 data type will automatically use a GradScaler
+ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+# poor man's data loader
+data_dir = os.path.join('data', dataset)
+train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
+def get_batch(split):
+    data = train_data if split == 'train' else val_data
+    ix = torch.randint(len(data) - block_size, (batch_size,))
+    x = [torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix]
+    y = [torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix]
+    x = torch.stack(x)
+    y = torch.stack(y)
+    if device_type == 'cuda':
+        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
+        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+    else:
+        x, y = x.to(device), y.to(device)
+    return x, y
+# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
+iter_num = 0
+best_val_loss = 1e9
+# attempt to derive vocab_size from the dataset
+meta_path = os.path.join(data_dir, 'meta.pkl')
+meta_vocab_size = None
+if os.path.exists(meta_path):
+    with open(meta_path, 'rb') as f:
+        meta = pickle.load(f)
+    meta_vocab_size = meta['vocab_size']
+    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
+# model init
+if model_type == 'gpt':
+    LLM = GPT
+    LLMConfig = GPTConfig
+    model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
+                  bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line
+elif model_type == 'rwkv':
+    LLM = RWKV
+    LLMConfig = RWKVConfig
+    model_args = dict(n_layer=n_layer, n_embd=n_embd, block_size=block_size,
+                  bias=bias, vocab_size=None, dtype=dtype,use_customized_cuda_kernel=use_customized_cuda_kernel) # start with model_args from command line
+if init_from == 'scratch':
+    # init a new model from scratch
+    print("Initializing a new model from scratch")
+    # determine the vocab size we'll use for from-scratch training
+    if meta_vocab_size is None:
+        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
+    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
+    model = LLM(LLMConfig(**model_args))
+elif init_from == 'resume':
+    print(f"Resuming training from {out_dir}")
+    # resume training from a checkpoint.
+    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    checkpoint_model_args = checkpoint['model_args']
+    # force these config attributes to be equal otherwise we can't even resume training
+    # the rest of the attributes (e.g. dropout) can stay as desired from command line
+    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
+        model_args[k] = checkpoint_model_args[k]
+    # create the model
+    gptconf = GPTConfig(**model_args)
+    model = GPT(gptconf)
+    state_dict = checkpoint['model']
+    # fix the keys of the state dictionary :(
+    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
+    unwanted_prefix = '_orig_mod.'
+    for k,v in list(state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+    model.load_state_dict(state_dict)
+    iter_num = checkpoint['iter_num']
+    best_val_loss = checkpoint['best_val_loss']
+elif init_from.startswith('gpt2'):
+    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
+    # initialize from OpenAI GPT-2 weights
+    override_args = dict(dropout=dropout)
+    model = GPT.from_pretrained(init_from, override_args)
+    # read off the created config params, so we can store them into checkpoint correctly
+    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
+        model_args[k] = getattr(model.config, k)
+elif init_from.startswith('RWKV'):
+    model = RWKV.from_pretrained(init_from,dtype=dtype,use_customized_cuda_kernel=use_customized_cuda_kernel)
+    enc = tiktoken.get_encoding("gpt2")
+    val_data_text = enc.decode(val_data)
+    toker = AutoTokenizer.from_pretrained(init_from)
+    val_data_rwkv = np.array(toker.encode(val_data_text))
+    val_data = val_data_rwkv
+# crop down the model block size if desired, using model surgery (only when the requested block_size is smaller than the model's)
+if block_size < model.config.block_size:
+    model.crop_block_size(block_size)
+    model_args['block_size'] = block_size # so that the checkpoint will have the right value
+model.to(device)
+# initialize a GradScaler. If enabled=False scaler is a no-op, so it is only active when dtype == 'float16'
+scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
+# optimizer: built by the model's own configure_optimizers; when resuming, its state is restored from the checkpoint
+optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
+if init_from == 'resume':
+    optimizer.load_state_dict(checkpoint['optimizer'])
+checkpoint = None # free up memory (drops the reference to the loaded checkpoint, if any)
+# compile the model (optional, controlled by the `compile` flag)
+if compile:
+    print("compiling the model... (takes a ~minute)")
+    unoptimized_model = model # keeps a handle on the uncompiled module
+    model = torch.compile(model) # requires PyTorch 2.0
+# wrap model into DDP container (only when running under DDP), bound to this process's local rank
+if ddp:
+    model = DDP(model, device_ids=[ddp_local_rank])
+# helps estimate an arbitrarily accurate loss over either split using many batches
+@torch.no_grad()
+def estimate_loss():
+    out = {}
+    model.eval()
+    for split in ['train', 'val']:
+        losses = torch.zeros(eval_iters)
+        for k in range(eval_iters):
+            X, Y = get_batch(split)
+            with ctx:
+                logits, loss = model(X, Y)
+            losses[k] = loss.item()
+        out[split] = losses.mean()
+    model.train()
+    return out
+# learning rate decay scheduler (cosine with warmup)
+def get_lr(it):
+    # 1) linear warmup for warmup_iters steps
+    if it < warmup_iters:
+        return learning_rate * it / warmup_iters
+    # 2) if it > lr_decay_iters, return min learning rate
+    if it > lr_decay_iters:
+        return min_lr
+    # 3) in between, use cosine decay down to min learning rate
+    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
+    assert 0 <= decay_ratio <= 1
+    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
+    return min_lr + coeff * (learning_rate - min_lr)
+# logging: wandb is imported and initialised only on the master process and only when wandb_log is set
+if wandb_log and master_process:
+    import wandb
+    wandb.init(project=wandb_project, name=wandb_run_name, config=config)
+# training loop setup
+X, Y = get_batch('train') # fetch the very first batch
+t0 = time.time() # start of the first timed iteration
+local_iter_num = 0 # number of iterations in the lifetime of this process
+raw_model = model.module if ddp else model # unwrap DDP container if needed
+running_mfu = -1.0 # -1.0 is the "no measurement yet" sentinel checked in the training loop
+while True:
+    # determine and set the learning rate for this iteration (scheduled via get_lr when decay_lr is set, else constant)
+    lr = get_lr(iter_num) if decay_lr else learning_rate
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+    # evaluate the loss on train/val sets and write checkpoints (master process only, every eval_interval iterations)
+    if iter_num % eval_interval == 0 and master_process:
+        losses = estimate_loss()
+        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+        if wandb_log:
+            wandb.log({
+                "iter": iter_num,
+                "train/loss": losses['train'],
+                "val/loss": losses['val'],
+                "lr": lr,
+                "mfu": running_mfu*100, # convert to percentage
+            })
+        if losses['val'] < best_val_loss or always_save_checkpoint: # NOTE(review): with always_save_checkpoint set, best_val_loss is overwritten even when the val loss got worse
+            best_val_loss = losses['val']
+            if iter_num > 0: # never write a checkpoint at iteration 0
+                checkpoint = {
+                    'model': raw_model.state_dict(),
+                    'optimizer': optimizer.state_dict(),
+                    'model_args': model_args,
+                    'iter_num': iter_num,
+                    'best_val_loss': best_val_loss,
+                    'config': config,
+                }
+                print(f"saving checkpoint to {out_dir}")
+                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
+    if iter_num == 0 and eval_only: # eval_only: stop right after the first evaluation, before any training step
+        break
+    # forward backward update, with optional gradient accumulation to simulate larger batch size
+    # and using the GradScaler if data type is float16
+    for micro_step in range(gradient_accumulation_steps):
+        if ddp:
+            # in DDP training we only need to sync gradients at the last micro step.
+            # the official way to do this is with model.no_sync() context manager, but
+            # I really dislike that this bloats the code and forces us to repeat code
+            # looking at the source of that context manager, it just toggles this variable
+            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
+        with ctx:
+            logits, loss = model(X, Y)
+            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
+        # immediately async prefetch next batch while model is doing the forward pass on the GPU
+        X, Y = get_batch('train')
+        # backward pass, with gradient scaling if training in fp16
+        scaler.scale(loss).backward()
+    # clip the gradient (grad_clip == 0.0 disables clipping); gradients are unscaled first so the threshold applies to their true values
+    if grad_clip != 0.0:
+        scaler.unscale_(optimizer)
+        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+    # step the optimizer and scaler if training in fp16
+    scaler.step(optimizer)
+    scaler.update()
+    # flush the gradients as soon as we can, no need for this memory anymore
+    optimizer.zero_grad(set_to_none=True)
+    # timing and logging: dt is the wall-clock time of this whole iteration, including any evaluation above
+    t1 = time.time()
+    dt = t1 - t0
+    t0 = t1
+    if iter_num % log_interval == 0 and master_process:
+        # get loss as float. note: this is a CPU-GPU sync point
+        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
+        lossf = loss.item() * gradient_accumulation_steps
+        if local_iter_num >= 5: # let the training loop settle a bit
+            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
+            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu # exponential moving average, seeded with the first measurement
+        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
+    iter_num += 1
+    local_iter_num += 1
+    # termination conditions
+    if iter_num > max_iters: # checked after the increment, so the iteration with iter_num == max_iters still runs
+        break
+if ddp: # tear down the process group once training has finished
+    destroy_process_group()

transformer_sizing.ipynb ADDED Viewed

	@@ -0,0 +1,402 @@

+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Transformer Theoretical Model\n",
+    "\n",
+    "This notebook stores a bunch of analysis about a Transformer, e.g. estimates the number of FLOPs, parameters, peak memory footprint, checkpoint size, etc."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import OrderedDict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# config_args = {\n",
+    "#     'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params\n",
+    "#     'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params\n",
+    "#     'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params\n",
+    "#     'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params\n",
+    "# }[model_type]\n",
+    "\n",
+    "block_size = 1024\n",
+    "vocab_size = 50257\n",
+    "n_layer = 12\n",
+    "n_head = 12\n",
+    "n_embd = 768\n",
+    "bias = False\n",
+    "assert not bias, \"this notebook assumes bias=False just for simplicity\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "we see: 124337664, expected: 124337664, match: True\n",
+      "name                 params     ratio (%) \n",
+      "emebedding/position      786432     0.6325\n",
+      "embedding/token        38597376    31.0424\n",
+      "embedding              39383808    31.6749\n",
+      "attention/ln                768     0.0006\n",
+      "attention/kqv           1769472     1.4231\n",
+      "attention/proj           589824     0.4744\n",
+      "attention               2360064     1.8981\n",
+      "mlp/ln                      768     0.0006\n",
+      "mlp/ffw                 2359296     1.8975\n",
+      "mlp/proj                2359296     1.8975\n",
+      "mlp                     4719360     3.7956\n",
+      "block                   7079424     5.6937\n",
+      "transformer            84953088    68.3245\n",
+      "ln_f                        768     0.0006\n",
+      "dense                         0     0.0000\n",
+      "total                 124337664   100.0000\n"
+     ]
+    }
+   ],
+   "source": [
+    "def params():\n",
+    "    \"\"\" estimates the number of parameters in the model\"\"\"\n",
+    "    out = OrderedDict()\n",
+    "\n",
+    "    # token and position embeddings\n",
+    "    out['emebedding/position'] = n_embd * block_size\n",
+    "    out['embedding/token'] = n_embd * vocab_size\n",
+    "    out['embedding'] = out['emebedding/position'] + out['embedding/token']\n",
+    "\n",
+    "    # attention blocks\n",
+    "    out['attention/ln'] = n_embd # note, bias=False in our LN\n",
+    "    out['attention/kqv'] = n_embd * 3*n_embd\n",
+    "    out['attention/proj'] = n_embd**2\n",
+    "    out['attention'] = out['attention/ln'] + out['attention/kqv'] + out['attention/proj']\n",
+    "\n",
+    "    # MLP blocks\n",
+    "    ffw_size = 4*n_embd # feed forward size\n",
+    "    out['mlp/ln'] = n_embd\n",
+    "    out['mlp/ffw'] = n_embd * ffw_size\n",
+    "    out['mlp/proj'] = ffw_size * n_embd\n",
+    "    out['mlp'] = out['mlp/ln'] + out['mlp/ffw'] + out['mlp/proj']\n",
+    "    \n",
+    "    # the transformer and the rest of it\n",
+    "    out['block'] = out['attention'] + out['mlp']\n",
+    "    out['transformer'] = n_layer * out['block']\n",
+    "    out['ln_f'] = n_embd # final layernorm\n",
+    "    out['dense'] = 0 # 0 because of parameter sharing. This layer uses the weights from the embedding layer\n",
+    "\n",
+    "    # total\n",
+    "    out['total'] = out['embedding'] + out['transformer'] + out['ln_f'] + out['dense']\n",
+    "\n",
+    "    return out\n",
+    "\n",
+    "# compare our param count to that reported by PyTorch\n",
+    "p = params()\n",
+    "params_total = p['total']\n",
+    "print(f\"we see: {params_total}, expected: {124337664}, match: {params_total == 124337664}\")\n",
+    "# create a header\n",
+    "print(f\"{'name':20s} {'params':10s} {'ratio (%)':10s}\")\n",
+    "for k,v in p.items():\n",
+    "    print(f\"{k:20s} {v:10d} {v/params_total*100:10.4f}\")\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "est checkpoint size: 1.49 GB\n",
+      "measured with wc -c ckpt.pt: 1542470366\n",
+      "fluff ratio: 103.38%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# we can now calculate the size of each checkpoint\n",
+    "# params are stored in fp32, and the AdamW optimizer has 2 additional buffers per param for statistics\n",
+    "params_bytes = params_total*4\n",
+    "params_and_buffers_bytes = params_bytes + 2*params_bytes\n",
+    "print(f\"est checkpoint size: {params_and_buffers_bytes/1e9:.2f} GB\")\n",
+    "measured_bytes = 1542470366 # from wc -c ckpt.pt\n",
+    "print(f\"measured with wc -c ckpt.pt: {measured_bytes}\")\n",
+    "print(f\"fluff ratio: {measured_bytes/params_and_buffers_bytes*100:.2f}%\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also estimate the ratio of our GPU memory that will be taken up just by the weights and the buffers inside the AdamW optimizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "memory ratio taken up just for parameters: 3.73%\n"
+     ]
+    }
+   ],
+   "source": [
+    "gpu_memory = 40e9 # 40 GB A100 GPU, roughly\n",
+    "print(f\"memory ratio taken up just for parameters: {params_and_buffers_bytes / gpu_memory * 100:.2f}%\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "i.e. not that much of the memory for this tiny model, most of the memory is activations (forward and backward). This of course changes dramatically for larger and larger models."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's estimate FLOPs for a single forward pass."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "name                 flops          ratio (%) \n",
+      "attention/kqv            3623878656     1.2426\n",
+      "attention/scores         1610612736     0.5522\n",
+      "attention/reduce         1610612736     0.5522\n",
+      "attention/proj           1207959552     0.4142\n",
+      "attention                8053063680     2.7612\n",
+      "mlp/ffw1                 4831838208     1.6567\n",
+      "mlp/ffw2                 4831838208     1.6567\n",
+      "mlp                      9663676416     3.3135\n",
+      "block                   17716740096     6.0747\n",
+      "transformer            212600881152    72.8963\n",
+      "dense                   79047426048    27.1037\n",
+      "forward_total          291648307200   100.0000\n",
+      "backward_total         583296614400   200.0000\n",
+      "total                  874944921600   300.0000\n"
+     ]
+    }
+   ],
+   "source": [
+    "def flops():\n",
+    "    # we only count Weight FLOPs, all other layers (LayerNorm, Softmax, etc) are effectively irrelevant\n",
+    "    # we count actual FLOPs, not MACs. Hence 2* all over the place\n",
+    "    # basically for any matrix multiply A (BxC) @ B (CxD) -> (BxD) flops are 2*B*C*D\n",
+    "\n",
+    "    out = OrderedDict()\n",
+    "    head_size = n_embd // n_head\n",
+    "\n",
+    "    # attention blocks\n",
+    "    # 1) the projection to key, query, values\n",
+    "    out['attention/kqv'] = 2 * block_size * (n_embd * 3*n_embd)\n",
+    "    # 2) calculating the attention scores\n",
+    "    out['attention/scores'] = 2 * block_size * block_size * n_embd\n",
+    "    # 3) the reduction of the values (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)\n",
+    "    out['attention/reduce'] = 2 * n_head * (block_size * block_size * head_size)\n",
+    "    # 4) the final linear projection\n",
+    "    out['attention/proj'] = 2 * block_size * (n_embd * n_embd)\n",
+    "    out['attention'] = sum(out['attention/'+k] for k in ['kqv', 'scores', 'reduce', 'proj'])\n",
+    "\n",
+    "    # MLP blocks\n",
+    "    ffw_size = 4*n_embd # feed forward size\n",
+    "    out['mlp/ffw1'] = 2 * block_size * (n_embd * ffw_size)\n",
+    "    out['mlp/ffw2'] = 2 * block_size * (ffw_size * n_embd)\n",
+    "    out['mlp'] = out['mlp/ffw1'] + out['mlp/ffw2']\n",
+    "\n",
+    "    # the transformer and the rest of it\n",
+    "    out['block'] = out['attention'] + out['mlp']\n",
+    "    out['transformer'] = n_layer * out['block']\n",
+    "    out['dense'] = 2 * block_size * (n_embd * vocab_size)\n",
+    "\n",
+    "    # forward,backward,total\n",
+    "    out['forward_total'] = out['transformer'] + out['dense']\n",
+    "    out['backward_total'] = 2 * out['forward_total'] # use common estimate of bwd = 2*fwd\n",
+    "    out['total'] = out['forward_total'] + out['backward_total']\n",
+    "\n",
+    "    return out\n",
+    "    \n",
+    "# print the FLOPs of each component and its ratio to the forward-pass total\n",
+    "f = flops()\n",
+    "flops_total = f['forward_total']\n",
+    "print(f\"{'name':20s} {'flops':14s} {'ratio (%)':10s}\")\n",
+    "for k,v in f.items():\n",
+    "    print(f\"{k:20s} {v:14d} {v/flops_total*100:10.4f}\")\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "palm_flops: 875062886400, flops: 874944921600, ratio: 1.0001\n"
+     ]
+    }
+   ],
+   "source": [
+    "# now here is an estimate copy pasted from the PaLM paper\n",
+    "# this formula is often used to calculate MFU (model flops utilization)\n",
+    "def palm_flops():\n",
+    "    \"\"\"estimate of the model flops following PaLM paper formula\"\"\"\n",
+    "    # non-embedding model parameters. note that we do not subtract the\n",
+    "    # embedding/token params because those are tied and get used in the last layer.\n",
+    "    N = params()['total'] - params()['emebedding/position']\n",
+    "    L, H, Q, T = n_layer, n_head, n_embd//n_head, block_size\n",
+    "    mf_per_token = 6*N + 12*L*H*Q*T\n",
+    "    mf = mf_per_token * block_size\n",
+    "    return mf\n",
+    "\n",
+    "print(f\"palm_flops: {palm_flops():d}, flops: {flops()['total']:d}, ratio: {palm_flops()/flops()['total']:.4f}\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Ok they are quite similar, giving some confidence that my math in flops() function was ~ok. Now, A100 is cited at 312TFLOPS bfloat16 on tensor cores. So what is our model flops utilization (MFU)? I trained the model above with a batch_size of 20 and grad_accum of 5, which runs in about 755ms on a single A100 GPU. We get:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "fraction of A100 used: 37.14%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# here is what we currently roughly measure\n",
+    "batch_size = 20 * 5 # 5 is grad_accum, so total batch size is 100\n",
+    "measured_time = 0.755 # in seconds per iteration\n",
+    "measured_throughput = batch_size / measured_time\n",
+    "flops_achieved = f['total'] * measured_throughput\n",
+    "\n",
+    "# A100 is cited to be 312 TFLOPS of bfloat16 running on tensor cores\n",
+    "a100_flops_promised = 312e12\n",
+    "\n",
+    "# the fraction of the A100 that we are using:\n",
+    "print(f\"fraction of A100 used: {flops_achieved / a100_flops_promised * 100:.2f}%\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, we'd prefer to be somewhere around 50%+, and not just for a single GPU but for an entire DDP run. So we still have some work to do, but at least we're within a factor of ~2X of what is achievable with this GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "time needed to train the model: 3.46 days\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Finally let's check out the 6ND approximation as total cost of training in FLOPs\n",
+    "model_size = params()['total'] # this is number of parameters, N\n",
+    "tokens_num = 300e9 # 300B tokens, this is dataset size in tokens, D\n",
+    "a100_flops = 312e12 # 312 TFLOPS\n",
+    "assumed_mfu = 0.3 # assume this model flops utilization (take the current 37% from above and add some DDP overhead)\n",
+    "flops_throughput = a100_flops * 8 * assumed_mfu # assume an 8XA100 node at 30% utilization\n",
+    "flops_needed = 6 * model_size * tokens_num # 6ND\n",
+    "time_needed_s = flops_needed / flops_throughput # in seconds\n",
+    "print(f\"time needed to train the model: {time_needed_s/3600/24:.2f} days\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is not a bad estimate at all. I trained this model and it converged in roughly 4 days. Btw as a good reference for where 6ND comes from and some intuition around it I recommend [Dzmitry's post](https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4)."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, FLOPs are just one constraint; the other one we have to keep close track of is memory bandwidth. TODO: estimate the LOAD/STORE costs of our model later."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pytorch2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "7f5833218766b48e6e35e4452ee875aac0e2188d05bbe5298f2c62b79f08b222"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}