In [2]:
from collections import defaultdict
import math
import multiprocessing
import json
import os
import re
import subprocess
import yaml

In [5]:
# Define base model name and default values for parameters
path_to_llamacpp = '/Users/macdev/Downloads/build/bin'
base_model_name = 'salamandra-2b-instruct'


In [8]:
def extract_from_config(config_file):
    """Extract context size, RoPE settings, dtype and sampling settings from a config JSON file.

    Parameters:
    - config_file (str): Path to the JSON configuration file.

    Returns:
    - dict: One entry per known parameter name (``ctx_size``, ``rope_freq_base``, ...).
      Entries whose source key is absent from the file are present with value ``None``.

    Raises:
    - FileNotFoundError: If ``config_file`` does not exist (callers rely on this).
    - json.JSONDecodeError: If the file is not valid JSON.

    Source keys written with a dot (e.g. ``"sampling.top_p"``) are first looked up
    literally; if no such literal key exists they are resolved as a path into nested
    JSON objects (``{"sampling": {"top_p": ...}}``). A flat ``dict.get`` on the dotted
    name could never see the nested form.
    """
    with open(config_file, 'r', encoding='utf-8') as file:
        config_data = json.load(file)

    def lookup(key):
        """Return the value for ``key`` (literal first, then dotted path) or None."""
        if isinstance(config_data, dict) and key in config_data:
            return config_data[key]
        node = config_data
        for part in key.split("."):
            if not isinstance(node, dict) or part not in node:
                return None
            node = node[part]
        return node

    # Output parameter name -> key (or dotted path) in the config file.
    # Insertion order is kept so the returned dict prints in a stable order.
    key_map = {
        'ctx_size': "max_position_embeddings",          # Context size
        'rope_freq_base': "rope_theta",                 # RoPE frequency base
        'rope_scaling': "rope_scaling",                 # RoPE scaling factor
        'rope_scaling_type': "rope_scaling_type",       # RoPE scaling type
        'torch_dtype': "torch_dtype",                   # Torch data type
        'top_p': "sampling.top_p",                      # Top-p sampling
        'temp': "sampling.temperature",                 # Sampling temperature
        'repeat_penalty': "sampling.repeat_penalty",    # Repetition penalty
        'repeat_last_n': "sampling.repeat_last_n",      # Last N tokens for repetition penalty
        'min_p': "sampling.min_p",                      # Minimum probability sampling
        'top_k': "sampling.top_k",                      # Top-k sampling
        'presence_penalty': "sampling.presence_penalty",    # Presence penalty for repeat tokens
        'frequency_penalty': "sampling.frequency_penalty",  # Frequency penalty for repeat tokens
        'mirostat': "sampling.mirostat",                # Mirostat sampling
        'mirostat_lr': "sampling.mirostat_lr",          # Mirostat learning rate
        'mirostat_ent': "sampling.mirostat_ent",        # Mirostat entropy target
        'tfs': "sampling.tfs",                          # Tail free sampling
        'typical': "sampling.typical",                  # Locally typical sampling
    }

    return {name: lookup(source_key) for name, source_key in key_map.items()}


In [7]:
# Maps a torch dtype name to the precision suffix used in GGUF file names.
# Any dtype not listed here (including None) falls back to "fp16".
unquantized = defaultdict(
    lambda: "fp16",
    {
        "float32": "fp32",
        "float16": "fp16",
        "bfloat16": "bf16",
    },
)

In [6]:
def extract_from_generation_config(generation_config_file):
    """Read generation settings from a JSON file and rename them for the perplexity command.

    Only the keys listed below are considered. A key that is missing from the file, or
    whose value is null, is left out of the result so it cannot overwrite a value
    obtained elsewhere.

    Parameters:
    - generation_config_file (str): Path to the generation config JSON file.

    Returns:
    - dict: Renamed parameters that had a non-None value in the file.
    """
    # (output name, key in the generation config file)
    wanted = (
        ('top_p', "top_p"),                          # Top-p sampling
        ('temp', "temperature"),                     # Sampling temperature
        ('repeat_penalty', "repetition_penalty"),    # Repetition penalty
        ('repeat_last_n', "repeat_last_n"),          # Last N tokens for repetition penalty
        ('top_k', "top_k"),                          # Top-k sampling (if present)
        ('presence_penalty', "presence_penalty"),    # Presence penalty
        ('frequency_penalty', "frequency_penalty"),  # Frequency penalty
    )

    with open(generation_config_file, 'r') as handle:
        raw = json.load(handle)

    found = {}
    for name, source_key in wanted:
        value = raw.get(source_key)
        if value is not None:
            found[name] = value
    return found


In [9]:
def get_parameters(use_temp=False):
    """Merge settings from config.json and generation_config.json in the working directory.

    Values from generation_config.json take precedence over those from config.json.
    A missing file is reported with a message and otherwise ignored.

    Parameters:
    - use_temp (bool): When left as False, any 'temp' entry found is forced to 0.

    Returns:
    - dict: The merged parameters (empty if neither file exists).
    """
    merged = {}

    # Base layer: config.json
    try:
        merged.update(extract_from_config('config.json'))
    except FileNotFoundError:
        print("config.json not found. Using default values.")

    # Override layer: generation_config.json wins for every value it actually provides
    try:
        overrides = extract_from_generation_config('generation_config.json')
    except FileNotFoundError:
        print("generation_config.json not found. Using default generation values.")
    else:
        merged.update({key: value for key, value in overrides.items() if value is not None})

    # Temperature is neutralised unless the caller explicitly asked to keep it
    if use_temp is False and 'temp' in merged:
        merged['temp'] = 0

    return merged


In [10]:
# Extract configuration parameters
config_params = get_parameters()
print(config_params)

# Use .get(): config_params has no 'torch_dtype' entry at all when config.json is
# missing, and `unquantized` already falls back to "fp16" for unknown/None dtypes.
base_precision = unquantized[config_params.get("torch_dtype")]

# File names derived from the base model name and its precision suffix
base_model = f'{base_model_name}_{base_precision}.gguf'
base_perplexity_file = f"perplexity_{base_precision}.txt"

# Leave one core free for the rest of the system, but always use at least one thread
threads = max(multiprocessing.cpu_count() - 1, 1)
batch_size = 512
ubatch_size = 128
dataset_file = "imatrix/oscar/imatrix-dataset.txt"
ppl_file = "ppl_test_data.txt"

{'ctx_size': 8192, 'rope_freq_base': 10000.0, 'rope_scaling': None, 'rope_scaling_type': None, 'torch_dtype': 'bfloat16', 'top_p': None, 'temp': 0, 'repeat_penalty': 1.2, 'repeat_last_n': None, 'min_p': None, 'top_k': None, 'presence_penalty': None, 'frequency_penalty': None, 'mirostat': None, 'mirostat_lr': None, 'mirostat_ent': None, 'tfs': None, 'typical': None}


In [3]:
# Read the list of quantization types to produce from the YAML settings file
yaml_file = 'quantizations.yaml'
with open(yaml_file, 'r') as yaml_handle:
    data = yaml.safe_load(yaml_handle)

# The types live under the top-level 'quantizations' key
quantization_types = data['quantizations']
print("Quantization types: ", quantization_types)

Quantization types: ['IQ2_S', 'IQ2_M', 'IQ3_M', 'IQ4_NL', 'IQ4_XS', 'Q3_K_L', 'Q3_K_M', 'Q4_K_M', 'Q4_K_S', 'Q5_K_M', 'Q5_K_S', 'Q6_K', 'Q8_0']


In [12]:
# Quantization parameters (both are passed to quantize_model below)

# When True, quantize_model adds the --leave-output-tensor flag to the llama-quantize command.
use_leave_output_tensor = True # Set to False if you don't want to use --leave-output-tensor

# Optional importance matrix path (set to None if you don't want to include --imatrix).
# Any falsy value makes quantize_model omit the --imatrix argument entirely.
imatrix_path = "imatrix/oscar/imatrix.dat" 

In [13]:
def quantize_model(
    quantization_type,
    base_model,
    base_model_name,
    path_to_llamacpp="",
    imatrix_path=None,
    use_leave_output_tensor=True,
    output_dir="."
):
    """
    Quantize the base model into the specified quantization type.

    The llama-quantize binary is invoked directly with an argument list (no shell), so
    paths containing spaces or shell metacharacters are passed through unchanged.
    Its stdout and stderr are written to ``<output_dir>/<quantization_type>_log.txt``.

    Parameters:
    - quantization_type (str): The type of quantization (e.g., "Q4_0", "Q5_K_M").
    - base_model (str): Path to the base model file (e.g., "salamandra-2b_bf16.gguf").
    - base_model_name (str): The base name of the model (e.g., "salamandra-2b").
    - path_to_llamacpp (str): Directory containing the llama-quantize binary.
    - imatrix_path (str, optional): Path to the importance matrix file. Default is None.
    - use_leave_output_tensor (bool): Whether to include the --leave-output-tensor flag. Default is True.
    - output_dir (str): Directory where the quantized models and logs will be saved
      (created if it does not exist). Default is current directory.

    Returns:
    - None. Success and failure are reported with printed messages only.
    """
    # Construct the output model path
    output_model = os.path.join(output_dir, f"{base_model_name}_{quantization_type}.gguf")

    # Quantization is expensive: never redo a model that is already on disk
    if os.path.exists(output_model):
        print(f"Quantized model {output_model} already exists. Skipping quantization.")
        return

    # Build the llama-quantize argument list: [binary, options..., input, output, type]
    command = [os.path.join(path_to_llamacpp, "llama-quantize")]
    if imatrix_path:
        command.extend(["--imatrix", str(imatrix_path)])
    if use_leave_output_tensor:
        command.append("--leave-output-tensor")
    command.extend([str(base_model), str(output_model), str(quantization_type)])

    # The log (and the model) go into output_dir, so it has to exist first
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    log_file = os.path.join(output_dir, f"{quantization_type}_log.txt")

    # Human-readable rendering for the notebook output only; it is not what gets executed
    shown_command = " ".join(command) + f" > {log_file} 2>&1"
    print(f"Quantizing model to {quantization_type} format with command: {shown_command}")

    try:
        with open(log_file, "w") as log:
            result = subprocess.run(command, stdout=log, stderr=subprocess.STDOUT, text=True)
    except OSError as error:
        # e.g. the binary is missing or not executable; report it instead of raising,
        # so a loop over several quantization types keeps going
        print(f"Error during quantization to {quantization_type}: {error}")
        return

    if result.returncode != 0:
        print(f"Error during quantization to {quantization_type}. Check {log_file} for details.")
    else:
        print(f"Successfully quantized model to {quantization_type} and saved as {output_model}.")


In [14]:
def run_command(command):
    """Run a shell command string and return whatever it wrote to standard output.

    The command is echoed before it runs. A non-zero exit status is reported by
    printing the captured standard error; no exception is raised for it.
    Note that if the command itself redirects its output (e.g. ``> file``), the
    returned string will not contain that output.
    """
    print(f"Running command: {command}")
    completed = subprocess.run(command, shell=True, capture_output=True, text=True)
    failed = completed.returncode != 0
    if failed:
        print(f"Error executing command: {completed.stderr}")
    return completed.stdout


In [15]:
def extract_perplexity(output):
    """Return the number following "Final estimate: PPL = " in ``output`` as a float.

    Returns None when the text contains no such line.
    """
    found = re.search(r"Final estimate: PPL = ([\d.]+)", output)
    return float(found.group(1)) if found else None


In [16]:
def build_command(model, output_file, ppl_file, config_params, threads=8, batch_size=512, ubatch_size=128,
                  path_to_llamacpp="/Users/macdev/Downloads/build/bin"):
    """Build the llama-perplexity shell command based on the provided parameters.

    Parameters:
    - model (str): Path to the GGUF model to evaluate.
    - output_file (str): File that receives the command's stdout and stderr.
    - ppl_file (str): Text file the perplexity is measured on.
    - config_params (dict): Optional settings; only entries that are present and not
      None are turned into flags. 'temp' is only emitted when it is non-zero.
    - threads, batch_size, ubatch_size (int): Always emitted.
    - path_to_llamacpp (str): Directory containing the llama-perplexity binary.
      Defaults to the location this notebook was originally run from.

    Returns:
    - str: A single command line, ending in a redirect of stdout and stderr to
      ``output_file``. Because of that redirect the output has to be read from
      ``output_file``, not from the process's captured stdout.

    Paths are shell-quoted, so names with spaces work; names made only of ordinary
    characters are emitted unchanged.
    """
    import shlex  # local import: only needed here, keeps the cell self-contained

    quote = shlex.quote
    command_parts = [
        quote(os.path.join(path_to_llamacpp, "llama-perplexity")),
        f"-m {quote(str(model))}",
        f"-f {quote(str(ppl_file))}",
        "--perplexity",
    ]

    # (config_params key, command-line flag), in the order the flags are emitted.
    # NOTE(review): nothing visible in this notebook sets 'rope_freq_scale'
    # (extract_from_config stores 'rope_scaling'), so that flag is currently never added.
    optional_flags = (
        ('ctx_size', "--ctx-size"),
        ('rope_freq_base', "--rope-freq-base"),
        ('rope_freq_scale', "--rope-freq-scale"),
        ('rope_scaling_type', "--rope-scaling"),
        # Sampling-related parameters
        ('top_p', "--top-p"),
        ('repeat_penalty', "--repeat-penalty"),
        ('repeat_last_n', "--repeat-last-n"),
    )
    for key, flag in optional_flags:
        value = config_params.get(key)
        if value is not None:
            command_parts.append(f"{flag} {value}")

    # `temp` is set to 0 in `get_parameters` unless `use_temp` is True; 0 means "omit"
    temp = config_params.get('temp')
    if temp is not None and temp != 0:
        command_parts.append(f"--temp {temp}")

    # Fixed parameters for threads and batch sizes
    command_parts.extend([
        f"--threads {threads}",
        f"--batch-size {batch_size}",
        f"--ubatch-size {ubatch_size}",
    ])

    # Redirect output to file
    return " ".join(command_parts) + f" > {quote(str(output_file))} 2>&1"


In [17]:
# Measure perplexity for the base model (skipped when a previous log is already on disk)
base_output = ""
if not os.path.exists(base_perplexity_file):
    base_command = build_command(base_model, base_perplexity_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size=ubatch_size)
    base_output = run_command(base_command)

base_perplexity = extract_perplexity(base_output)
if base_perplexity is None:
    # The command redirects its output into base_perplexity_file, so the captured
    # stdout is normally empty; the result has to be read back from that file.
    try:
        with open(base_perplexity_file, 'r') as file:
            base_output = file.read()
    except FileNotFoundError:
        print(f"Output file {base_perplexity_file} not found.")
    base_perplexity = extract_perplexity(base_output)

if base_perplexity is None:
    print(f"Could not find a perplexity estimate for the base model in {base_perplexity_file}.")

calculated_perplexity_recently = False # This will be set to True later

In [26]:
# Produce one quantized model per requested type; settings shared by every call
# are collected once, only the quantization type varies.
quantize_settings = dict(
    base_model=base_model,
    base_model_name=base_model_name,
    path_to_llamacpp=path_to_llamacpp,
    imatrix_path=imatrix_path,
    use_leave_output_tensor=use_leave_output_tensor,
)
for quant in quantization_types:
    quantize_model(quantization_type=quant, **quantize_settings)

Quantizing model to IQ2_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf "./salamandra-2b-instruct_IQ2_S.gguf" IQ2_S > "./IQ2_S_log.txt" 2>&1
Successfully quantized model to IQ2_S and saved as ./salamandra-2b-instruct_IQ2_S.gguf.
Quantizing model to IQ2_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf "./salamandra-2b-instruct_IQ2_M.gguf" IQ2_M > "./IQ2_M_log.txt" 2>&1
Successfully quantized model to IQ2_M and saved as ./salamandra-2b-instruct_IQ2_M.gguf.
Quantizing model to IQ3_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b-instruct_bf16.gguf "./salamandra-2b-instruct_IQ3_M.gguf" IQ3_M > "./IQ3_M_log.txt" 2>&1
Successfully quantized model to IQ3_M and saved as ./sal

In [16]:
# Measure perplexity for each quantized model
perplexity_results = dict()
perplexity_results[base_precision] = base_perplexity
for quant in quantization_types:
    calculated_perplexity_recently = True

    model = f"{base_model_name}_{quant}.gguf"
    output_file = f"perplexity_{quant}.txt"

    command = build_command(model, output_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size=ubatch_size)
    output = run_command(command)

    perplexity = extract_perplexity(output)
    if perplexity is None:
        # The command redirects its output into output_file, so the captured stdout
        # is normally empty; read the estimate back from the file instead.
        try:
            with open(output_file, 'r') as file:
                output = file.read()
            perplexity = extract_perplexity(output)
        except FileNotFoundError:
            print(f"Output file {output_file} not found.")

    if perplexity is None:
        print(f"Could not find a perplexity estimate for {quant} in {output_file}.")
    perplexity_results[quant] = perplexity

Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ2_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ2_M.txt 2>&1
Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ3_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_M.txt 2>&1
Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ4_NL.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ4_NL.txt 2>&1
Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b-instruct_IQ4_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq

In [None]:
# Load previous measurements if we didn't just measure perplexity for each quantized model
if not calculated_perplexity_recently:
    perplexity_results = {base_precision: base_perplexity}

    # Each quantization type has its own log file from an earlier run
    for quant in quantization_types:
        output_file = f"perplexity_{quant}.txt"
        try:
            with open(output_file, 'r') as file:
                output = file.read()
        except FileNotFoundError:
            print(f"Output file {output_file} not found.")
            perplexity = None
        else:
            perplexity = extract_perplexity(output)

        perplexity_results[quant] = perplexity

    # Table of ln(PPL(Q)/PPL(base)); rows without a usable perplexity are skipped
    print("\nPerplexity Comparison Table:")
    print(f"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}")
    print("=" * 55)
    for quant, ppl in perplexity_results.items():
        if not (ppl and base_perplexity):
            continue
        ln_ratio = round(math.log(ppl / base_perplexity), 6)
        print(f"{quant:<20} {ppl:<10} {ln_ratio:<25}")

    print(perplexity_results)


In [None]:
# Print the comparison table: one row per entry with ln(PPL(Q)/PPL(base)).
# Entries with a missing/zero perplexity are left out (as is everything when
# the base perplexity is missing).
print("\nPerplexity Comparison Table:")
print(f"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}")
print("=" * 55)
for quant, ppl in perplexity_results.items():
    if not (ppl and base_perplexity):
        continue
    ln_ratio = round(math.log(ppl / base_perplexity), 6)
    print(f"{quant:<20} {ppl:<10} {ln_ratio:<25}")

In [18]:
# Rebuild the results from the per-quantization log files on disk,
# starting from the base model's perplexity.
perplexity_results = {base_precision: base_perplexity}

for quant in quantization_types:
    output_file = f"perplexity_{quant}.txt"
    try:
        with open(output_file, 'r') as file:
            output = file.read()
    except FileNotFoundError:
        print(f"Output file {output_file} not found.")
        perplexity = None
    else:
        perplexity = extract_perplexity(output)

    perplexity_results[quant] = perplexity

# Table of ln(PPL(Q)/PPL(base)); rows without a usable perplexity are skipped
print("\nPerplexity Comparison Table:")
print(f"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}")
print("=" * 55)
for quant, ppl in perplexity_results.items():
    if not (ppl and base_perplexity):
        continue
    ln_ratio = round(math.log(ppl / base_perplexity), 6)
    print(f"{quant:<20} {ppl:<10} {ln_ratio:<25}")

print(perplexity_results)



Perplexity Comparison Table:
Quantization Type PPL(Q) ln(PPL(Q)/PPL(fp16)) 
bf16 15.3799 0.0 
IQ2_S 25.3893 0.501266 
IQ2_M 21.6684 0.342794 
IQ3_M 16.774 0.086769 
IQ4_NL 15.9602 0.037037 
IQ4_XS 15.9591 0.036968 
Q3_K_L 16.5067 0.070705 
Q3_K_M 16.8567 0.091687 
Q4_K_M 15.8651 0.03106 
Q4_K_S 15.9346 0.035431 
Q5_K_M 15.4746 0.006139 
Q5_K_S 15.4901 0.00714 
Q6_K 15.3961 0.001053 
Q8_0 15.3831 0.000208 
{'bf16': 15.3799, 'IQ2_S': 25.3893, 'IQ2_M': 21.6684, 'IQ3_M': 16.774, 'IQ4_NL': 15.9602, 'IQ4_XS': 15.9591, 'Q3_K_L': 16.5067, 'Q3_K_M': 16.8567, 'Q4_K_M': 15.8651, 'Q4_K_S': 15.9346, 'Q5_K_M': 15.4746, 'Q5_K_S': 15.4901, 'Q6_K': 15.3961, 'Q8_0': 15.3831}
