LeonardPuettmann committed: Update README.md
Commit 0a67e9e • Parent(s): 89bd038

README.md CHANGED
````diff
@@ -31,38 +31,28 @@ Due to its size, the model runs very well on CPUs.
 
 ```python
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from peft import PeftModel
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
+model_id = "LeonardPuettmann/LlaMaestra-3.2-1B-Instruct-v0.1-4bit"
 
-
-
-    quantization_config=bnb_config, # Same quantization config as before
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
     device_map="auto",
     trust_remote_code=True,
 )
 
-tokenizer = AutoTokenizer.from_pretrained(
-
-ft_model = PeftModel.from_pretrained(base_model, "LeonardPuettmann/LlaMaestra-3.2-1B-Instruct-v0.1-4bit")
+tokenizer = AutoTokenizer.from_pretrained(model_id, add_bos_token=True, trust_remote_code=True)
 
 row_json = [
     {"role": "system", "content": "Your job is to return translations for sentences or words from either Italian to English or English to Italian."},
-    {"role": "user", "content": "
+    {"role": "user", "content": "Do you sell tickets for the bus?"},
 ]
 
 prompt = tokenizer.apply_chat_template(row_json, tokenize=False)
 model_input = tokenizer(prompt, return_tensors="pt").to("cuda")
 
 with torch.no_grad():
-    print(tokenizer.decode(
+    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=1024)[0]))
 ```
 
 ## Data used
````
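The updated snippet moves inputs to CUDA, while the README text around this hunk notes that, due to its size, the model runs very well on CPUs. Below is a minimal CPU-only sketch of the same flow; the `device_map="cpu"` placement and `.to("cpu")` call are assumptions for illustration, not part of the committed README, and whether the 4-bit checkpoint loads without a GPU depends on how it was quantized.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "LeonardPuettmann/LlaMaestra-3.2-1B-Instruct-v0.1-4bit"

# Assumed CPU variant: place the model on the CPU instead of "auto"/CUDA.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_bos_token=True, trust_remote_code=True)

row_json = [
    {"role": "system", "content": "Your job is to return translations for sentences or words from either Italian to English or English to Italian."},
    {"role": "user", "content": "Do you sell tickets for the bus?"},
]

# Build the chat prompt and keep the input tensors on the CPU.
prompt = tokenizer.apply_chat_template(row_json, tokenize=False)
model_input = tokenizer(prompt, return_tensors="pt").to("cpu")

with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=1024)[0]))
```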
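The committed example decodes the whole generated sequence, so the printed text repeats the prompt before the translation. Continuing from the snippet above (reusing its `model`, `tokenizer`, and `model_input`), one way to print only the reply is to slice off the prompt tokens; `skip_special_tokens=True` is an assumed nicety, not part of the committed README.

```python
# Decode only the newly generated tokens, dropping the echoed prompt.
with torch.no_grad():
    output_ids = model.generate(**model_input, max_new_tokens=1024)[0]

prompt_len = model_input["input_ids"].shape[-1]
print(tokenizer.decode(output_ids[prompt_len:], skip_special_tokens=True))
```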