# Poro 34B GPTQ quantization

## Step 1: Import the transformers library and check CUDA availability

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

In [2]:
import torch

In [4]:
# Display the installed PyTorch version string so the environment is documented.
torch.__version__

'2.2.1+cu121'

In [5]:
# Check whether PyTorch reports an available CUDA device in this environment.
torch.cuda.is_available()

True

## Step 2: Load the original Poro 34B model from Hugging Face and save it locally

In [3]:
# Hub repository id of the base model that is fine-tuned and quantized below.
model_name = "LumiOpen/Poro-34B"

In [4]:
# Load the tokenizer that belongs to the base model repository.
# NOTE(review): no `revision` is passed here, so the tokenizer comes from the
# repository's default revision, while the model cell below passes
# `revision=branch` — confirm the tokenizer files are the same on both.
org_tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/5.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

In [None]:
# Hub revision (branch) of the checkpoint to load.
branch = "1000B"

# Load the base model weights in bfloat16 from the selected revision.
org_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    revision=branch,
    torch_dtype=torch.bfloat16,
)

config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/57.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

model-00001-of-00014.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

model-00002-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00007-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00008-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00009-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00010-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00011-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00012-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00013-of-00014.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00014-of-00014.safetensors:   0%|          | 0.00/4.52G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [6]:
# Keep a handle on the loaded model's configuration object (this is a reference
# to `org_model.config`, not a copy).
model_configuration = org_model.config

In [7]:
# The original model configuration is missing a sequence-length parameter, so it
# is set explicitly here. Because `model_configuration` refers to
# `org_model.config`, this changes the configuration of `org_model` in place.
# NOTE(review): confirm which downstream step actually reads `sequence_length`.
model_configuration.sequence_length = 2048

In [8]:
# Keep a local copy of Poro 34B and its tokenizer. This is optional, but it
# gives faster processing when the notebook has to be run several times.
org_model.save_pretrained(
    "Poro-34B",
    safe_serialization=True,
    max_shard_size="5GB",
)
org_tokenizer.save_pretrained("Poro-34B")

('Poro-34B/tokenizer_config.json',
 'Poro-34B/special_tokens_map.json',
 'Poro-34B/tokenizer.json')

## Step 3: Fine-tuned parameters are loaded from the local Poro-34B-Lora-185 directory and merged into the original model

In [9]:
from peft import PeftModel

In [10]:
# Local directory that holds the fine-tuned adapter parameters.
model_id2 = "Poro-34B-Lora-185"

In [12]:
# Attach the fine-tuned adapter stored in `model_id2` to the base model.
# The adapter is only merged into the base weights in the next step and is
# never trained further in this notebook, so it is loaded in inference mode
# (`is_trainable=False`) instead of being marked as trainable.
loaded_model = PeftModel.from_pretrained(org_model, model_id2, is_trainable=False)

In [13]:
# Merge the fine-tuned adapter weights into the original Poro 34B model.
# The object returned by `merge_and_unload()` is what gets saved and then
# quantized in the following steps.
merged_model = loaded_model.merge_and_unload()

In [14]:
# Write the merged model and the tokenizer to the same local directory so
# that the quantization step can load both from there.
merged_model.save_pretrained(
    "Poro-34B-185c",
    safe_serialization=True,
    max_shard_size="5GB",
)
org_tokenizer.save_pretrained("Poro-34B-185c")

('Poro-34B-185c/tokenizer_config.json',
 'Poro-34B-185c/special_tokens_map.json',
 'Poro-34B-185c/tokenizer.json')

## Step 4: GPTQ quantization is applied to the merged fine-tuned model

In [15]:
# Local directory of the merged fine-tuned model that is quantized in this step.
model_id = "Poro-34B-185c"

In [16]:
# Load the tokenizer from the merged-model directory; it is passed to the
# GPTQ configuration and saved alongside the quantized model later on.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [17]:
# Texts handed to GPTQConfig as the quantization dataset. The dataset is a
# list of strings; only one string is used here to show the process.
dataset = [
    "Peruuta ensin vanhaan osoitteeseen tilattu uutiskirje kirjeen alareunan “Peruuta tilaus” -linkistä.\n"
    "Tilaa uutiskirje uudelleen oikeaan osoitteeseen."
]

In [18]:
# 4-bit GPTQ configuration built from the example dataset and the tokenizer
# of the merged model.
gptq_config = GPTQConfig(
    bits=4,
    tokenizer=tokenizer,
    dataset=dataset,
)

In [None]:
# Load the merged model with the GPTQ configuration attached; the resulting
# `model` is the quantized model that is saved and uploaded below.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    quantization_config=gptq_config,
)

In [None]:
# Quantized model and tokenizer are saved locally.
# `save_pretrained` selects the safetensors format through `safe_serialization`
# (as in the earlier save cells); `use_safetensors` is a `from_pretrained`
# argument and would be silently ignored here.
model.save_pretrained("Poro-34B-GPTQ-SGroup", safe_serialization=True)
tokenizer.save_pretrained("Poro-34B-GPTQ-SGroup")

In [None]:
# Log in to the Hugging Face Hub interactively before uploading.
# `notebook_login()` asks for the access token at run time, so no credential
# is stored in the notebook itself.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Quantized model and tokenizer are uploaded to the Hugging Face Hub.
# `push_to_hub` selects the safetensors format through `safe_serialization`;
# `use_safetensors` is not one of its parameters and would be silently ignored.
model.push_to_hub("Poro-34B-GPTQ-SGroup", safe_serialization=True)
tokenizer.push_to_hub("Poro-34B-GPTQ-SGroup")