Initial Upload

Browse files

Files changed (10) hide show

README.md +87 -3
config.json +63 -0
generation_config.json +7 -0
merges.txt +0 -0
model.safetensors +3 -0
quantization_config.json +24 -0
special_tokens_map.json +34 -0
tokenizer.json +0 -0
tokenizer_config.json +155 -0
vocab.json +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,87 @@
----
-license: apache-2.0
----

+---
+language:
+- en
+- de
+- fr
+- it
+- pt
+- hi
+- es
+- th
+license: apache-2.0
+library_name: transformers
+tags:
+- autoround
+- auto-round
+- intel-autoround
+- gptq
+- woq
+- pytorch
+- transformers
+- intel
+model_name: SmolLM2 135M Instruct
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+inference: false
+model_creator: HuggingFaceTB
+pipeline_tag: text-generation
+prompt_template: '{prompt}
+  '
+quantized_by: fbaldassarri
+---
+## Model Information
+Quantized version of [HuggingFaceTB/SmolLM2-135M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct) using torch.float32 for quantization tuning.
+- 4 bits (INT4)
+- group size = 128
+- Symmetrical Quantization
+- Method: WoQ (weight-only quantization), AutoRound format
+Fast and low memory, 2-3X speedup (slight accuracy drop at W4G128)
+Quantization framework: [Intel AutoRound](https://github.com/intel/auto-round) v0.4.3
+Note: this INT4 version of SmolLM2-135M-Instruct has been quantized to run inference on CPU.
+## Replication Recipe
+### Step 1 Install Requirements
+I suggest installing the requirements into a dedicated Python virtualenv or a conda environment.
+```
+wget https://github.com/intel/auto-round/archive/refs/tags/v0.4.3.tar.gz
+tar -xvzf v0.4.3.tar.gz
+cd auto-round-0.4.3
+pip install -r requirements-cpu.txt --upgrade
+```
+### Step 2 Build Intel AutoRound wheel from sources
+```
+pip install -vvv --no-build-isolation -e .[cpu]
+```
+### Step 3 Script for Quantization
+```
+  from transformers import AutoModelForCausalLM, AutoTokenizer
+  model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
+  model = AutoModelForCausalLM.from_pretrained(model_name)
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  from auto_round import AutoRound
+  bits, group_size, sym, device, amp = 4, 128, True, 'cpu', False
+  autoround = AutoRound(model, tokenizer, nsamples=128, iters=200, seqlen=512, batch_size=4, bits=bits, group_size=group_size, sym=sym, device=device, amp=amp)
+  autoround.quantize()
+  output_dir = "./AutoRound/HuggingFaceTB_SmolLM2-135M-Instruct-auto_round-int4-gs128-sym"
+  autoround.save_quantized(output_dir, format='auto_round', inplace=True)
+```
+## License
+[Apache 2.0 License](https://choosealicense.com/licenses/apache-2.0/)
+## Disclaimer
+This quantized model comes with no warranty. It has been developed for research purposes only.

config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 576,
+  "initializer_range": 0.041666666666666664,
+  "intermediate_size": 1536,
+  "is_llama_config": true,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 9,
+  "num_hidden_layers": 30,
+  "num_key_value_heads": 3,
+  "pad_token_id": 2,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "amp": false,
+    "autoround_version": "0.4.3",
+    "backend": "auto_round:gptq:exllamav2",
+    "batch_size": 4,
+    "bits": 4,
+    "data_type": "int",
+    "dataset": "NeelNanda/pile-10k",
+    "enable_minmax_tuning": true,
+    "enable_norm_bias_tuning": false,
+    "enable_quanted_input": true,
+    "gradient_accumulate_steps": 1,
+    "group_size": 128,
+    "iters": 200,
+    "low_gpu_mem_usage": false,
+    "lr": 0.005,
+    "minmax_lr": 0.005,
+    "nsamples": 128,
+    "quant_method": "intel/auto-round",
+    "scale_dtype": "torch.float16",
+    "seqlen": 512,
+    "sym": true,
+    "to_quant_block_names": null
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers.js_config": {
+    "kv_cache_dtype": {
+      "fp16": "float16",
+      "q4f16": "float16"
+    }
+  },
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 49152
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 2,
+  "transformers_version": "4.47.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80a8f1248f1e691bb455f760b25e9096dd9d8f887805f4a3c7fc07d77678494f
+size 169414824

quantization_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bits": 4,
+  "group_size": 128,
+  "sym": true,
+  "data_type": "int",
+  "enable_quanted_input": true,
+  "enable_minmax_tuning": true,
+  "seqlen": 512,
+  "batch_size": 4,
+  "scale_dtype": "torch.float16",
+  "lr": 0.005,
+  "minmax_lr": 0.005,
+  "gradient_accumulate_steps": 1,
+  "iters": 200,
+  "amp": false,
+  "nsamples": 128,
+  "low_gpu_mem_usage": false,
+  "to_quant_block_names": null,
+  "enable_norm_bias_tuning": false,
+  "dataset": "NeelNanda/pile-10k",
+  "autoround_version": "0.4.3",
+  "quant_method": "intel/auto-round",
+  "backend": "auto_round:gptq:exllamav2"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,155 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff