In [24]:
from peft import PeftModel

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

### Load the base model

In [17]:
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b_v2", torch_dtype=torch.float16, device_map="auto")

In [8]:
tokenizer = AutoTokenizer.from_pretrained("ashrielbrian/openllama_3b_v2-teknium-GPT4-LLM-Cleaned")

In [18]:
inp = tokenizer("write a function that takes in two integers, and returns its modulo.", return_tensors="pt")


In [19]:
inp

{'input_ids': tensor([[ 1, 2786, 260, 1155, 347, 2976, 293, 846, 1146, 6014,
 29522, 295, 5729, 737, 966, 19795, 29520]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
inputs = {k: v.to("cuda") for k, v in inp.items()}
inputs

{'input_ids': tensor([[ 1, 2786, 260, 1155, 347, 2976, 293, 846, 1146, 6014,
 29522, 295, 5729, 737, 966, 19795, 29520]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [22]:
with torch.no_grad():
 generate_ids = model.generate(**inputs, max_length=1000)

In [23]:
# the output is practically gibberish because it was trained as a completion model, and is NOT instruction-tuned.
outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
outputs


'write a function that takes in two integers, and returns its modulo.\nFor example, if you have the integers 10 and 2, then the modulo of 10 and 2 is 2.\nThe modulo of 10 and 3 is 3.\nThe modulo of 10 and 4 is 4.\nThe modulo of 10 and 5 is 5.\nThe modulo of 10 and 6 is 6.\nThe modulo of 10 and 7 is 7.\nThe modulo of 10 and 8 is 8.\nThe modulo of 10 and 9 is 9.\nThe modulo of 10 and 10 is 10.\nThe modulo of 10 and 11 is 11.\nThe modulo of 10 and 12 is 12.\nThe modulo of 10 and 13 is 13.\nThe modulo of 10 and 14 is 14.\nThe modulo of 10 and 15 is 15.\nThe modulo of 10 and 16 is 16.\nThe modulo of 10 and 17 is 17.\nThe modulo of 10 and 18 is 18.\nThe modulo of 10 and 19 is 19.\nThe modulo of 10 and 20 is 20.\nThe modulo of 10 and 21 is 21.\nThe modulo of 10 and 22 is 22.\nThe modulo of 10 and 23 is 23.\nThe modulo of 10 and 24 is 24.\nThe modulo of 10 and 25 is 25.\nThe modulo of 10 and 26 is 26.\nThe modulo of 10 and 27 is 27.\nThe modulo of 10 and 28 is 28.\nThe modulo of 10 and 29 is 2

### Load adapters without merging

In [32]:
peft_model_id = "ashrielbrian/openllama_3b_v2-teknium-GPT4-LLM-Cleaned"
model = PeftModel.from_pretrained(model, peft_model_id)

In [35]:
with torch.no_grad():
 generate_ids = model.generate(**inputs, max_length=1000)

outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
outputs

'write a function that takes in two integers, and returns its modulo.\n\nHere is one way to write the function in Python:\n\n```python\ndef modulo(a, b):\n return a % b\n```\n\nThis function takes in two arguments, `a` and `b`, and returns the remainder of `a` when divided by `b`. The remainder is the value that remains after the division.'

### Merge the adapter into the base model
Helpful resource on [Huggingface](https://huggingface.co/docs/peft/main/en/developer_guides/lora).

In [36]:
model.merge_and_unload()

PeftModelForCausalLM(
 (base_model): LoraModel(
 (model): LlamaForCausalLM(
 (model): LlamaModel(
 (embed_tokens): Embedding(32000, 3200, padding_idx=0)
 (layers): ModuleList(
 (0-25): 26 x LlamaDecoderLayer(
 (self_attn): LlamaSdpaAttention(
 (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
 (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
 (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
 (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
 (rotary_emb): LlamaRotaryEmbedding()
 )
 (mlp): LlamaMLP(
 (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
 (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
 (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
 (act_fn): SiLU()
 )
 (input_layernorm): LlamaRMSNorm()
 (post_attention_layernorm): LlamaRMSNorm()
 )
 )
 (norm): LlamaRMSNorm()
 )
 (lm_head): Linear(in_features=3200, out_features=32000, bias=False)
 )
 )
)

In [37]:
with torch.no_grad():
 generate_ids = model.generate(**inputs, max_length=1000)

# inference latency here is lower than if we kept the adapter separate as in the previous step
# comparing walltime between the unmerged adapter model, with the merged LORA weights, are 1.3s and 0.9s respectively.
outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
outputs

'write a function that takes in two integers, and returns its modulo.\n\nHere is one way to write the function in Python:\n\n```python\ndef modulo(a, b):\n return a % b\n```\n\nThis function takes in two arguments, `a` and `b`, and returns the remainder of `a` when divided by `b`. The remainder is the value that remains after the division.'