Initial Upload

Browse files

Files changed (9) hide show

README.md +84 -3
config.json +51 -0
generation_config.json +6 -0
model.safetensors +3 -0
quantization_config.json +24 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,84 @@
----
-license: apache-2.0
----

+---
+language:
+- en
+- fr
+tags:
+- pytorch
+- causal-lm
+- mistral
+- autoround
+- auto-round
+- intel-autoround
+- gptq
+- woq
+- intel
+- pytorch
+- mistralai
+license: apache-2.0
+model_name: Mistral 7B Instruct v0.3
+base_model:
+- mistralai/Mistral-7B-Instruct-v0.3
+inference: false
+model_creator: mistralai
+pipeline_tag: text-generation
+prompt_template: '{prompt}
+  '
+quantized_by: fbaldassarri
+---
+## Model Information
+Quantized version of [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) using torch.float32 for quantization tuning.
+- 4 bits (INT4)
+- group size = 128
+- Symmetrical Quantization
+- Method WoQ (AutoRound format)
+Fast and low memory, 2-3X speedup (slight accuracy drop at W4G128)
+Quantization framework: [Intel AutoRound](https://github.com/intel/auto-round) v0.4.3
+Note: this INT4 version of Mistral-7B-Instruct-v0.3 has been quantized to run inference on CPU.
+## Replication Recipe
+### Step 1 Install Requirements
+I suggest installing the requirements into a dedicated Python virtualenv or a conda environment.
+```
+wget https://github.com/intel/auto-round/archive/refs/tags/v0.4.3.tar.gz
+tar -xvzf v0.4.3.tar.gz
+cd auto-round-0.4.3
+pip install -r requirements-cpu.txt --upgrade
+```
+### Step 2 Build Intel AutoRound wheel from sources
+```
+pip install -vvv --no-build-isolation -e .[cpu]
+```
+### Step 3 Script for Quantization
+```
+  from transformers import AutoModelForCausalLM, AutoTokenizer
+  model_name = "mistralai/Mistral-7B-Instruct-v0.3"
+  model = AutoModelForCausalLM.from_pretrained(model_name)
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  from auto_round import AutoRound
+  bits, group_size, sym, device, amp = 4, 128, True, 'cpu', False
+  autoround = AutoRound(model, tokenizer, nsamples=128, iters=200, seqlen=512, batch_size=4, bits=bits, group_size=group_size, sym=sym, device=device, amp=amp)
+  autoround.quantize()
+  output_dir = "./AutoRound/mistralai_Mistral-7B-v0.3-Instruct-autoround-int4-gs128-sym"
+  autoround.save_quantized(output_dir, format='auto_round', inplace=True)
+```
+## License
+[Apache 2.0 License](https://choosealicense.com/licenses/apache-2.0/)
+## Disclaimer
+This quantized model comes with no warranty. It has been developed only for research purposes.

config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "quantization_config": {
+    "amp": false,
+    "autoround_version": "0.4.3",
+    "backend": "auto_round:gptq:exllamav2",
+    "batch_size": 4,
+    "bits": 4,
+    "data_type": "int",
+    "dataset": "NeelNanda/pile-10k",
+    "enable_minmax_tuning": true,
+    "enable_norm_bias_tuning": false,
+    "enable_quanted_input": true,
+    "gradient_accumulate_steps": 1,
+    "group_size": 128,
+    "iters": 200,
+    "low_gpu_mem_usage": false,
+    "lr": 0.005,
+    "minmax_lr": 0.005,
+    "nsamples": 128,
+    "quant_method": "intel/auto-round",
+    "scale_dtype": "torch.float16",
+    "seqlen": 512,
+    "sym": true,
+    "to_quant_block_names": null
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "vocab_size": 32768
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.47.1"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77ab7ef744686fd65d19008a525d274a5f29fd31e5d23c6f466e24f4f582aa14
+size 4705872016

quantization_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bits": 4,
+  "group_size": 128,
+  "sym": true,
+  "data_type": "int",
+  "enable_quanted_input": true,
+  "enable_minmax_tuning": true,
+  "seqlen": 512,
+  "batch_size": 4,
+  "scale_dtype": "torch.float16",
+  "lr": 0.005,
+  "minmax_lr": 0.005,
+  "gradient_accumulate_steps": 1,
+  "iters": 200,
+  "amp": false,
+  "nsamples": 128,
+  "low_gpu_mem_usage": false,
+  "to_quant_block_names": null,
+  "enable_norm_bias_tuning": false,
+  "dataset": "NeelNanda/pile-10k",
+  "autoround_version": "0.4.3",
+  "quant_method": "intel/auto-round",
+  "backend": "auto_round:gptq:exllamav2"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff