kreas
/

FA_benchmarks

Model card Files Files and versions Community

kreas committed on Mar 14, 2024

Commit

a45b339

verified ·

1 Parent(s): 12902c5

Upload /Llama-2-7b-hf/int8_batch_size_1_sq_len_256_new_tokens_256/experiment_config.json with huggingface_hub

Browse files

Files changed (1) hide show

Llama-2-7b-hf/int8_batch_size_1_sq_len_256_new_tokens_256/experiment_config.json +103 -0

Llama-2-7b-hf/int8_batch_size_1_sq_len_256_new_tokens_256/experiment_config.json ADDED Viewed

	@@ -0,0 +1,103 @@

+{
+    "backend": {
+        "name": "pytorch",
+        "version": "2.2.1+cu118",
+        "_target_": "optimum_benchmark.backends.pytorch.backend.PyTorchBackend",
+        "model": "meta-llama/Llama-2-7b-hf",
+        "task": "text-generation",
+        "library": "transformers",
+        "device": "cuda",
+        "device_ids": "0",
+        "seed": 42,
+        "inter_op_num_threads": null,
+        "intra_op_num_threads": null,
+        "hub_kwargs": {
+            "revision": "main",
+            "force_download": false,
+            "local_files_only": false,
+            "trust_remote_code": false
+        },
+        "no_weights": true,
+        "device_map": null,
+        "torch_dtype": "float16",
+        "amp_autocast": false,
+        "amp_dtype": null,
+        "eval_mode": true,
+        "to_bettertransformer": false,
+        "low_cpu_mem_usage": null,
+        "attn_implementation": "flash_attention_2",
+        "cache_implementation": null,
+        "torch_compile": false,
+        "torch_compile_config": {},
+        "quantization_scheme": "bnb",
+        "quantization_config": {
+            "llm_int8_threshold": 0.0,
+            "load_in_8bit": true
+        },
+        "deepspeed_inference": false,
+        "deepspeed_inference_config": {},
+        "peft_type": null,
+        "peft_config": {}
+    },
+    "launcher": {
+        "name": "process",
+        "_target_": "optimum_benchmark.launchers.process.launcher.ProcessLauncher",
+        "device_isolation": false,
+        "start_method": "spawn"
+    },
+    "benchmark": {
+        "name": "inference",
+        "_target_": "optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark",
+        "duration": 10,
+        "warmup_runs": 10,
+        "input_shapes": {
+            "batch_size": 1,
+            "num_choices": 2,
+            "sequence_length": 256
+        },
+        "new_tokens": null,
+        "energy": false,
+        "memory": true,
+        "latency": true,
+        "forward_kwargs": {},
+        "generate_kwargs": {
+            "max_new_tokens": 256,
+            "min_new_tokens": 256
+        },
+        "call_kwargs": {}
+    },
+    "experiment_name": "Llama-2-7b-hf-int8",
+    "task": null,
+    "model": null,
+    "device": null,
+    "library": null,
+    "environment": {
+        "cpu": " AMD Ryzen Threadripper PRO 5995WX 64-Cores",
+        "cpu_count": 128,
+        "cpu_ram_mb": 134841.131008,
+        "system": "Linux",
+        "machine": "x86_64",
+        "platform": "Linux-6.5.0-14-generic-x86_64-with-glibc2.35",
+        "processor": "x86_64",
+        "python_version": "3.10.12",
+        "gpu": [
+            "NVIDIA GeForce RTX 4090"
+        ],
+        "gpu_count": 1,
+        "gpu_vram_mb": 25757220864,
+        "optimum_benchmark_version": "0.2.0",
+        "optimum_benchmark_commit": "5bf349dbbc5ecdbf6ca94ac70f80ac44bd84dcc0",
+        "transformers_version": "4.38.2",
+        "transformers_commit": null,
+        "accelerate_version": "0.28.0",
+        "accelerate_commit": null,
+        "diffusers_version": null,
+        "diffusers_commit": null,
+        "optimum_version": null,
+        "optimum_commit": null,
+        "timm_version": null,
+        "timm_commit": null,
+        "peft_version": null,
+        "peft_commit": null
+    }
+}