{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "import math\n", "import multiprocessing\n", "import json\n", "import os\n", "import re\n", "import subprocess\n", "import yaml" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Define base model name and default values for parameters\n", "path_to_llamacpp = '/Users/macdev/Downloads/build/bin'\n", "base_model_name = 'salamandra-2b-instruct'\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def extract_from_config(config_file):\n", " \"\"\"Extract parameters like context size, rope frequency base, and other sampling settings from a config JSON file.\"\"\"\n", " with open(config_file, 'r') as file:\n", " config_data = json.load(file)\n", "\n", " # Extract parameters if present\n", " params = {}\n", " params['ctx_size'] = config_data.get(\"max_position_embeddings\") # Context size\n", " params['rope_freq_base'] = config_data.get(\"rope_theta\") # RoPE frequency base\n", " params['rope_scaling'] = config_data.get(\"rope_scaling\") # RoPE scaling factor\n", " params['rope_scaling_type'] = config_data.get(\"rope_scaling_type\") # RoPE scaling type\n", " params['torch_dtype'] = config_data.get(\"torch_dtype\") # Torch data type\n", " params['top_p'] = config_data.get(\"sampling.top_p\") # Top-p sampling\n", " params['temp'] = config_data.get(\"sampling.temperature\") # Sampling temperature\n", " params['repeat_penalty'] = config_data.get(\"sampling.repeat_penalty\") # Repetition penalty\n", " params['repeat_last_n'] = config_data.get(\"sampling.repeat_last_n\") # Last N tokens for repetition penalty\n", " params['min_p'] = config_data.get(\"sampling.min_p\") # Minimum probability sampling\n", " params['top_k'] = config_data.get(\"sampling.top_k\") # Top-k sampling\n", " params['presence_penalty'] = config_data.get(\"sampling.presence_penalty\") # Presence penalty for repeat tokens\n", " params['frequency_penalty'] = config_data.get(\"sampling.frequency_penalty\") # Frequency penalty for repeat tokens\n", " params['mirostat'] = config_data.get(\"sampling.mirostat\") # Mirostat sampling\n", " params['mirostat_lr'] = config_data.get(\"sampling.mirostat_lr\") # Mirostat learning rate\n", " params['mirostat_ent'] = config_data.get(\"sampling.mirostat_ent\") # Mirostat entropy target\n", " params['tfs'] = config_data.get(\"sampling.tfs\") # Tail free sampling\n", " params['typical'] = config_data.get(\"sampling.typical\") # Locally typical sampling\n", "\n", " return params\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "unquantized = defaultdict(lambda: \"fp16\")\n", "unquantized[\"float32\"] = \"fp32\"\n", "unquantized[\"float16\"] = \"fp16\"\n", "unquantized[\"bfloat16\"] = \"bf16\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def extract_from_generation_config(generation_config_file):\n", " \"\"\"Extract generation-specific parameters relevant to llama-perplexity if available.\"\"\"\n", " with open(generation_config_file, 'r') as file:\n", " generation_data = json.load(file)\n", " \n", " # Extract and map only parameters useful for llama-perplexity\n", " params = {}\n", " params['top_p'] = generation_data.get(\"top_p\") # Top-p sampling\n", " params['temp'] = generation_data.get(\"temperature\") # Sampling temperature\n", " params['repeat_penalty'] = 
generation_data.get(\"repetition_penalty\") # Repetition penalty\n", " params['repeat_last_n'] = generation_data.get(\"repeat_last_n\") # Last N tokens for repetition penalty\n", " params['top_k'] = generation_data.get(\"top_k\") # Top-k sampling (if present)\n", " params['presence_penalty'] = generation_data.get(\"presence_penalty\") # Presence penalty\n", " params['frequency_penalty'] = generation_data.get(\"frequency_penalty\")# Frequency penalty\n", "\n", " # Remove None values to avoid overwriting defaults\n", " params = {key: value for key, value in params.items() if value is not None}\n", "\n", " return params\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def get_parameters(use_temp=False):\n", " \"\"\"Retrieve parameters from the configuration files or use defaults, preferring generation_config if available.\"\"\"\n", " # Initialize default parameters\n", " config_params = dict()\n", "\n", " # Extract parameters from config.json, if available\n", " try:\n", " config_params.update(extract_from_config('config.json'))\n", " except FileNotFoundError:\n", " print(\"config.json not found. Using default values.\")\n", "\n", " # Extract parameters from generation_config.json, if available and prefer these values\n", " try:\n", " gen_params = extract_from_generation_config('generation_config.json')\n", " # Update config_params with values from gen_params, if they are not None\n", " for key, value in gen_params.items():\n", " if value is not None:\n", " config_params[key] = value\n", " except FileNotFoundError:\n", " print(\"generation_config.json not found. Using default generation values.\")\n", "\n", " # Ensure that temperature ('temp') is never used\n", " if 'temp' in config_params and use_temp is False:\n", " config_params['temp'] = 0 # Set temperature to 0\n", "\n", " return config_params\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'ctx_size': 8192, 'rope_freq_base': 10000.0, 'rope_scaling': None, 'rope_scaling_type': None, 'torch_dtype': 'bfloat16', 'top_p': None, 'temp': 0, 'repeat_penalty': 1.2, 'repeat_last_n': None, 'min_p': None, 'top_k': None, 'presence_penalty': None, 'frequency_penalty': None, 'mirostat': None, 'mirostat_lr': None, 'mirostat_ent': None, 'tfs': None, 'typical': None}\n" ] } ], "source": [ "# Extract configuration parameters\n", "config_params = get_parameters()\n", "print(config_params)\n", "\n", "base_precision = unquantized[config_params[\"torch_dtype\"]]\n", "\n", "base_model = f'{base_model_name}_{base_precision}.gguf'\n", "base_perplexity_file = f\"perplexity_{base_precision}.txt\"\n", "\n", "threads = max(multiprocessing.cpu_count() - 1, 1)\n", "batch_size = 512\n", "ubatch_size = 128\n", "dataset_file = \"imatrix/oscar/imatrix-dataset.txt\" \n", "ppl_file = \"ppl_test_data.txt\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quantization types: ['IQ2_S', 'IQ2_M', 'IQ3_M', 'IQ4_NL', 'IQ4_XS', 'Q3_K_L', 'Q3_K_M', 'Q4_K_M', 'Q4_K_S', 'Q5_K_M', 'Q5_K_S', 'Q6_K', 'Q8_0']\n" ] } ], "source": [ "# Load YAML file and extract quantization types\n", "yaml_file = 'quantizations.yaml'\n", "with open(yaml_file, 'r') as file:\n", " data = yaml.safe_load(file)\n", "\n", "# Extract the list of quantization types\n", "quantization_types = data['quantizations']\n", "print(\"Quantization types: \", quantization_types)" ] }, { "cell_type": 
"code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Quantization parameters\n", "use_leave_output_tensor = True # Set to False if you don't want to use --leave-output-tensor\n", "\n", "# Optional importance matrix path (set to None if you don't want to include --imatrix)\n", "imatrix_path = \"imatrix/oscar/imatrix.dat\" " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def quantize_model(\n", " quantization_type, \n", " base_model, \n", " base_model_name, \n", " path_to_llamacpp=\"\",\n", " imatrix_path=None, \n", " use_leave_output_tensor=True,\n", " output_dir=\".\"\n", "):\n", " \"\"\"\n", " Quantize the base model into the specified quantization type.\n", "\n", " Parameters:\n", " - quantization_type (str): The type of quantization (e.g., \"Q4_0\", \"Q5_K_M\").\n", " - base_model (str): Path to the base model file (e.g., \"salamandra-2b_bf16.gguf\").\n", " - base_model_name (str): The base name of the model (e.g., \"salamandra-2b\").\n", " - path_to_llamacpp (str): Path to the llama-quantize binary.\n", " - imatrix_path (str, optional): Path to the importance matrix file. Default is None.\n", " - use_leave_output_tensor (bool): Whether to include the --leave-output-tensor flag. Default is True.\n", " - output_dir (str): Directory where the quantized models and logs will be saved. Default is current directory.\n", "\n", " Returns:\n", " - None\n", " \"\"\"\n", " # Construct the output model path\n", " output_model = os.path.join(output_dir, f\"{base_model_name}_{quantization_type}.gguf\")\n", "\n", " # Check if the quantized model already exists\n", " if os.path.exists(output_model):\n", " print(f\"Quantized model {output_model} already exists. Skipping quantization.\")\n", " return\n", "\n", " # Build the llama-quantize command\n", " command_parts = [\n", " os.path.join(path_to_llamacpp, \"llama-quantize\")\n", " ]\n", "\n", " # Conditionally add the --imatrix argument if the path is provided\n", " if imatrix_path:\n", " command_parts.append(f\"--imatrix {imatrix_path}\")\n", "\n", " # Conditionally add the --leave-output-tensor argument based on the external boolean\n", " if use_leave_output_tensor:\n", " command_parts.append(\"--leave-output-tensor\")\n", "\n", " # Add base model, output model, and quantization type\n", " command_parts.extend([\n", " f\"{base_model}\",\n", " f\"\\\"{output_model}\\\"\",\n", " f\"{quantization_type}\"\n", " ])\n", "\n", " # Redirect output to a log file for each quantization type\n", " log_file = os.path.join(output_dir, f\"{quantization_type}_log.txt\")\n", " command_parts.append(f\"> \\\"{log_file}\\\" 2>&1\")\n", "\n", " # Join the command parts into a single command string\n", " quantize_command = \" \".join(command_parts)\n", "\n", " # Run the quantization command\n", " print(f\"Quantizing model to {quantization_type} format with command: {quantize_command}\")\n", " result = subprocess.run(quantize_command, shell=True, text=True)\n", " if result.returncode != 0:\n", " print(f\"Error during quantization to {quantization_type}. 
Check {log_file} for details.\")\n", " else:\n", " print(f\"Successfully quantized model to {quantization_type} and saved as {output_model}.\")\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def run_command(command):\n", " \"\"\"Run a shell command, print it, and return its standard output.\"\"\"\n", " print(f\"Running command: {command}\")\n", " result = subprocess.run(command, shell=True, capture_output=True, text=True)\n", " if result.returncode != 0:\n", " print(f\"Error executing command: {result.stderr}\")\n", " return result.stdout\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def extract_perplexity(output):\n", " \"\"\"Extract the final perplexity estimate from llama-perplexity output.\"\"\"\n", " match = re.search(r\"Final estimate: PPL = ([\\d.]+)\", output)\n", " if match:\n", " return float(match.group(1))\n", " return None\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def build_command(model, output_file, ppl_file, config_params, threads=8, batch_size=512, ubatch_size=128):\n", " \"\"\"Build the llama-perplexity command based on the provided parameters.\"\"\"\n", " command_parts = [\n", " os.path.join(path_to_llamacpp, \"llama-perplexity\"), # use the llama.cpp build path defined above\n", " f\"-m {model}\",\n", " f\"-f {ppl_file}\",\n", " \"--perplexity\",\n", " ]\n", "\n", " # Add parameters only if they are set in config_params\n", " if config_params.get('ctx_size') is not None:\n", " command_parts.append(f\"--ctx-size {config_params['ctx_size']}\")\n", " if config_params.get('rope_freq_base') is not None:\n", " command_parts.append(f\"--rope-freq-base {config_params['rope_freq_base']}\")\n", " if config_params.get('rope_freq_scale') is not None:\n", " command_parts.append(f\"--rope-freq-scale {config_params['rope_freq_scale']}\")\n", " if config_params.get('rope_scaling_type') is not None:\n", " command_parts.append(f\"--rope-scaling {config_params['rope_scaling_type']}\")\n", "\n", " # Add sampling-related parameters if they are set\n", " if config_params.get('top_p') is not None:\n", " command_parts.append(f\"--top-p {config_params['top_p']}\")\n", " if config_params.get('repeat_penalty') is not None:\n", " command_parts.append(f\"--repeat-penalty {config_params['repeat_penalty']}\")\n", " if config_params.get('repeat_last_n') is not None:\n", " command_parts.append(f\"--repeat-last-n {config_params['repeat_last_n']}\")\n", "\n", " # `temp` is zeroed in `get_parameters` when `use_temp` is False,\n", " # so only add it when it is non-zero (i.e. `use_temp` was True)\n", " if config_params.get('temp') is not None and config_params['temp'] != 0:\n", " command_parts.append(f\"--temp {config_params['temp']}\")\n", "\n", " # Add fixed parameters for threads and batch sizes\n", " command_parts.extend([\n", " f\"--threads {threads}\",\n", " f\"--batch-size {batch_size}\",\n", " f\"--ubatch-size {ubatch_size}\",\n", " ])\n", "\n", " # Redirect output to file\n", " command = \" \".join(command_parts) + f\" > {output_file} 2>&1\"\n", " return command\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Measure perplexity for the base model\n", "if os.path.exists(base_perplexity_file):\n", " with open(base_perplexity_file, 'r') as file:\n", " base_output = file.read()\n", "else:\n", " base_command = build_command(base_model, base_perplexity_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size= 
ubatch_size)\n", " base_output = run_command(base_command)\n", "base_perplexity = extract_perplexity(base_output)\n", "calculated_perplexity_recently = False # This will be set to True later" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quantizing model to IQ2_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_IQ2_S.gguf\" IQ2_S > \"./IQ2_S_log.txt\" 2>&1\n", "Successfully quantized model to IQ2_S and saved as ./salamandra-2b-instruct_IQ2_S.gguf.\n", "Quantizing model to IQ2_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_IQ2_M.gguf\" IQ2_M > \"./IQ2_M_log.txt\" 2>&1\n", "Successfully quantized model to IQ2_M and saved as ./salamandra-2b-instruct_IQ2_M.gguf.\n", "Quantizing model to IQ3_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_IQ3_M.gguf\" IQ3_M > \"./IQ3_M_log.txt\" 2>&1\n", "Successfully quantized model to IQ3_M and saved as ./salamandra-2b-instruct_IQ3_M.gguf.\n", "Quantizing model to IQ4_NL format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_IQ4_NL.gguf\" IQ4_NL > \"./IQ4_NL_log.txt\" 2>&1\n", "Successfully quantized model to IQ4_NL and saved as ./salamandra-2b-instruct_IQ4_NL.gguf.\n", "Quantizing model to IQ4_XS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_IQ4_XS.gguf\" IQ4_XS > \"./IQ4_XS_log.txt\" 2>&1\n", "Successfully quantized model to IQ4_XS and saved as ./salamandra-2b-instruct_IQ4_XS.gguf.\n", "Quantizing model to Q3_K_L format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q3_K_L.gguf\" Q3_K_L > \"./Q3_K_L_log.txt\" 2>&1\n", "Successfully quantized model to Q3_K_L and saved as ./salamandra-2b-instruct_Q3_K_L.gguf.\n", "Quantizing model to Q3_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q3_K_M.gguf\" Q3_K_M > \"./Q3_K_M_log.txt\" 2>&1\n", "Successfully quantized model to Q3_K_M and saved as ./salamandra-2b-instruct_Q3_K_M.gguf.\n", "Quantizing model to Q4_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q4_K_M.gguf\" Q4_K_M > \"./Q4_K_M_log.txt\" 2>&1\n", "Successfully quantized model to Q4_K_M and saved as ./salamandra-2b-instruct_Q4_K_M.gguf.\n", "Quantizing model to Q4_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q4_K_S.gguf\" Q4_K_S > \"./Q4_K_S_log.txt\" 2>&1\n", "Successfully quantized model to Q4_K_S and saved as 
./salamandra-2b-instruct_Q4_K_S.gguf.\n", "Quantizing model to Q5_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q5_K_M.gguf\" Q5_K_M > \"./Q5_K_M_log.txt\" 2>&1\n", "Successfully quantized model to Q5_K_M and saved as ./salamandra-2b-instruct_Q5_K_M.gguf.\n", "Quantizing model to Q5_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q5_K_S.gguf\" Q5_K_S > \"./Q5_K_S_log.txt\" 2>&1\n", "Successfully quantized model to Q5_K_S and saved as ./salamandra-2b-instruct_Q5_K_S.gguf.\n", "Quantizing model to Q6_K format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q6_K.gguf\" Q6_K > \"./Q6_K_log.txt\" 2>&1\n", "Successfully quantized model to Q6_K and saved as ./salamandra-2b-instruct_Q6_K.gguf.\n", "Quantizing model to Q8_0 format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf \"./salamandra-2b-instruct_Q8_0.gguf\" Q8_0 > \"./Q8_0_log.txt\" 2>&1\n", "Successfully quantized model to Q8_0 and saved as ./salamandra-2b-instruct_Q8_0.gguf.\n" ] } ], "source": [ "# Quantize the models\n", "for quant in quantization_types:\n", " quantize_model(\n", " quantization_type=quant,\n", " base_model=base_model,\n", " base_model_name=base_model_name,\n", " path_to_llamacpp=path_to_llamacpp,\n", " imatrix_path=imatrix_path,\n", " use_leave_output_tensor=use_leave_output_tensor,\n", " )" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ2_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ2_M.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ3_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_M.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ4_NL.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ4_NL.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ4_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ4_XS.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q3_K_L.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_L.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q3_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 
--repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_M.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q4_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q4_K_M.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q4_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q4_K_S.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q5_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q5_K_M.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q5_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q5_K_S.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q6_K.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q6_K.txt 2>&1\n", "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_Q8_0.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q8_0.txt 2>&1\n" ] } ], "source": [ "# Measure perplexity for each quantized model\n", "perplexity_results = dict()\n", "perplexity_results[base_precision] = base_perplexity\n", "for quant in quantization_types:\n", " calculated_perplexity_recently = True\n", " \n", " model = f\"{base_model_name}_{quant}.gguf\"\n", " output_file = f\"perplexity_{quant}.txt\"\n", "\n", " command = build_command(model, output_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size=ubatch_size)\n", " output = run_command(command)\n", "\n", " perplexity = extract_perplexity(output)\n", " perplexity_results[quant] = perplexity" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load previous measurements if we didn't just measure perplexity for each quantized model\n", "if not calculated_perplexity_recently:\n", " perplexity_results = dict()\n", " perplexity_results[base_precision] = base_perplexity\n", "\n", " for quant in quantization_types:\n", " output_file = f\"perplexity_{quant}.txt\"\n", " try:\n", " with open(output_file, 'r') as file:\n", " output = file.read()\n", " perplexity = extract_perplexity(output)\n", " except FileNotFoundError:\n", " print(f\"Output file {output_file} not found.\")\n", " perplexity = None\n", "\n", " perplexity_results[quant] = perplexity\n", "\n", " # Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n", " print(\"\\nPerplexity Comparison Table:\")\n", " print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n", " print(\"=\" * 55)\n", " for quant, ppl in perplexity_results.items():\n", " if ppl and base_perplexity:\n", " ln_ratio = round(math.log(ppl / base_perplexity), 6)\n", " print(f\"{quant:<20} {ppl:<10} 
{ln_ratio:<25}\")\n", "\n", " print(perplexity_results)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n", "print(\"\\nPerplexity Comparison Table:\")\n", "print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n", "print(\"=\" * 55)\n", "for quant, ppl in perplexity_results.items():\n", " if ppl and base_perplexity:\n", " ln_ratio = round(math.log(ppl / base_perplexity), 6)\n", " print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Perplexity Comparison Table:\n", "Quantization Type PPL(Q) ln(PPL(Q)/PPL(fp16)) \n", "=======================================================\n", "bf16 15.3799 0.0 \n", "IQ2_S 25.3893 0.501266 \n", "IQ2_M 21.6684 0.342794 \n", "IQ3_M 16.774 0.086769 \n", "IQ4_NL 15.9602 0.037037 \n", "IQ4_XS 15.9591 0.036968 \n", "Q3_K_L 16.5067 0.070705 \n", "Q3_K_M 16.8567 0.091687 \n", "Q4_K_M 15.8651 0.03106 \n", "Q4_K_S 15.9346 0.035431 \n", "Q5_K_M 15.4746 0.006139 \n", "Q5_K_S 15.4901 0.00714 \n", "Q6_K 15.3961 0.001053 \n", "Q8_0 15.3831 0.000208 \n", "{'bf16': 15.3799, 'IQ2_S': 25.3893, 'IQ2_M': 21.6684, 'IQ3_M': 16.774, 'IQ4_NL': 15.9602, 'IQ4_XS': 15.9591, 'Q3_K_L': 16.5067, 'Q3_K_M': 16.8567, 'Q4_K_M': 15.8651, 'Q4_K_S': 15.9346, 'Q5_K_M': 15.4746, 'Q5_K_S': 15.4901, 'Q6_K': 15.3961, 'Q8_0': 15.3831}\n" ] } ], "source": [ "perplexity_results = dict()\n", "perplexity_results[base_precision] = base_perplexity\n", "\n", "for quant in quantization_types:\n", " output_file = f\"perplexity_{quant}.txt\"\n", " try:\n", " with open(output_file, 'r') as file:\n", " output = file.read()\n", " perplexity = extract_perplexity(output)\n", " except FileNotFoundError:\n", " print(f\"Output file {output_file} not found.\")\n", " perplexity = None\n", "\n", " perplexity_results[quant] = perplexity\n", "\n", "# Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n", "print(\"\\nPerplexity Comparison Table:\")\n", "print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n", "print(\"=\" * 55)\n", "for quant, ppl in perplexity_results.items():\n", " if ppl and base_perplexity:\n", " ln_ratio = round(math.log(ppl / base_perplexity), 6)\n", " print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")\n", "\n", "print(perplexity_results)\n" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.0" } }, "nbformat": 4, "nbformat_minor": 2 }