import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
from argparse import ArgumentParser

import torch
from torch import nn
from torch.profiler import ProfilerActivity, profile, record_function
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

torch.set_float32_matmul_precision('high')


def sample(outputs):
    # Sample the next token from the softmax distribution over the last position's logits.
    next_token_logits = outputs.logits[:, -1, :]
    probs = nn.functional.softmax(next_token_logits, dim=-1)
    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
    return next_tokens


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--device", default='cuda')
    parser.add_argument("--model", required=True)
    parser.add_argument("--use_cache", action='store_true')
    parser.add_argument("--max_new_tokens", type=int, default=16_000)
    parser.add_argument("--output_path", required=True)
    args = parser.parse_args()

    prompt = 'hello'  # dummy input

    # Build the model from its config with random weights; extend the position range
    # so that positional embeddings cover the full generation length.
    config = AutoConfig.from_pretrained(args.model)
    config.max_position_embeddings = args.max_new_tokens + 10
    model = AutoModelForCausalLM.from_config(config)
    model.eval()
    model = model.to(args.device)
    model = torch.compile(model)
    model_size = sum(p.numel() for p in model.parameters())

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    tokenized_prompt = tokenizer(prompt, return_tensors="pt")
    tokenized_prompt = tokenized_prompt['input_ids'].to(args.device)

    model_input = {
        "input_ids": tokenized_prompt,
        "use_cache": args.use_cache,
    }
    # RWKV exposes its recurrent state as `state`; transformer models use `past_key_values`.
    cache_name = "state" if args.model.startswith("RWKV") else "past_key_values"
    model_input[cache_name] = None

    os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
    writer = open(args.output_path, 'w')

    for tok_idx in range(args.max_new_tokens):
        with torch.no_grad():
            if args.use_cache and model_input[cache_name] is not None:
                # With a cache, only the most recent token is fed to the model.
                model_input["input_ids"] = tokenized_prompt[:, -1:].to(args.device)
            else:
                model_input["input_ids"] = tokenized_prompt.to(args.device)

            # Profile only the forward pass of this decoding step.
            with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                         profile_memory=True, record_shapes=False) as prof:
                with record_function("model_inference"):
                    output = model.forward(**model_input)

            model_input[cache_name] = getattr(output, cache_name)
            next_tokens = sample(output)
            tokenized_prompt = torch.cat([tokenized_prompt.cpu(), next_tokens[:, None].cpu()], dim=-1)

        full_profile = next(event for event in prof.key_averages() if event.key == 'model_inference')
        writer.write(json.dumps({
            "model_name": args.model,
            "model_size": model_size,
            "token_id": tok_idx,
            "strategy": args.device,
            "cpu_time": full_profile.cpu_time,
            "cuda_time": full_profile.cuda_time,
            "cpu_memory_usage": full_profile.cpu_memory_usage,
            "cuda_memory_usage": full_profile.cuda_memory_usage,
            "self_cpu_memory_usage": full_profile.self_cpu_memory_usage,
            "self_cuda_memory_usage": full_profile.self_cuda_memory_usage,
            "max_memory_allocated": torch.cuda.max_memory_allocated(),
        }) + '\n')
        torch.cuda.empty_cache()

    writer.close()

"""
Usage:

python benchmark_inference_time.py --model RWKV/rwkv-4-3b-pile --use_cache --output_path data/inference_time/rwkv-3b.jsonl
python benchmark_inference_time.py --model RWKV/rwkv-4-7b-pile --use_cache --output_path data/inference_time/rwkv-7b.jsonl
python benchmark_inference_time.py --model RWKV/rwkv-4-14b-pile --use_cache --output_path data/inference_time/rwkv-14b.jsonl
python benchmark_inference_time.py --model facebook/opt-2.7b --use_cache --output_path data/inference_time/opt-2.7b.jsonl
python benchmark_inference_time.py --model facebook/opt-6.7b --use_cache --output_path data/inference_time/opt-6.7b.jsonl
python benchmark_inference_time.py --model EleutherAI/pythia-2.8b --use_cache --output_path data/inference_time/pythia-2.8b.jsonl
python benchmark_inference_time.py --model EleutherAI/pythia-6.9b --use_cache --output_path data/inference_time/pythia-6.9b.jsonl
python benchmark_inference_time.py --model EleutherAI/gpt-neo-2.7B --use_cache --output_path data/inference_time/gpt-neo-2.7B.jsonl

############# Plotting Code ##############

import json

import matplotlib.pyplot as plt
import numpy as np


def get_jsonl(f):
    return [json.loads(x) for x in open(f).readlines()]


fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 4))

for model_name in [
    "rwkv-3b",
    # "rwkv-7b",
    # "rwkv-14b",
    "opt-2.7b",
    "gpt-neo-2.7B",
    "pythia-2.8b",
]:
    data = get_jsonl(f"data/inference_time/{model_name}.jsonl")
    cuda_time = [x['cuda_time'] for x in data]
    cumulative_time = np.cumsum(cuda_time) / (1000 * 1000)  # microseconds -> seconds
    memory_usage = [x['max_memory_allocated'] / (2**10) / (2**10) / (2**10) for x in data]  # bytes -> GB
    ax1.plot([x / 1000 for x in cuda_time][100:], label=model_name)  # microseconds -> ms, skip warm-up tokens
    ax2.plot(cumulative_time, label=model_name)
    ax3.plot(memory_usage, label=model_name)

ax1.set_xlabel("# Tokens")
ax1.set_ylabel("Time (ms) to generate the #-th token")
ax1.grid()
ax1.legend()
ax1.set_title("Single Token Generation Latency")

ax2.set_xlabel("# Tokens")
ax2.set_ylabel("Cumulative time (s) to generate the #-th token")
ax2.grid()
ax2.legend()
ax2.set_title("Cumulative Generation Latency")

ax3.set_xlabel("# Tokens")
ax3.set_ylabel("Memory usage (GB)")
ax3.grid()
ax3.legend()
ax3.set_title("Memory Usage in Generation")
"""