main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b-instruct_bf16.gguf' to './salamandra-2b-instruct_Q5_K_M.gguf' as Q5_K_M
llama_model_loader: loaded meta data with 31 key-value pairs and 219 tensors from salamandra-2b-instruct_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.padding_token_id u32 = 0
llama_model_loader: - kv 27: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 28: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 29: tokenizer.chat_template str = {%- if not date_string is defined %}{...
llama_model_loader: - kv 30: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q5_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 343.75 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 2196.23 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization

main: quantize time = 9470.02 ms
main: total time = 9470.02 ms
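Note: the exact command line is not captured in the log above. A likely invocation for llama.cpp build 3906, reconstructed from the file paths and the imatrix reference in the output (binary name and path are assumptions), would be:

    ./llama-quantize --imatrix imatrix/oscar/imatrix.dat salamandra-2b-instruct_bf16.gguf ./salamandra-2b-instruct_Q5_K_M.gguf Q5_K_M

Here --imatrix supplies the importance matrix used during the k-quant conversions, and the trailing Q5_K_M positional argument selects the target quantization mix shown in the log.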