Leonard Püttmann
committed on
Update README.md
Browse files
README.md
CHANGED
@@ -31,38 +31,28 @@ Due to its size, the model runs very well on CPUs.
|
|
31 |
|
32 |
```python
|
33 |
import torch
|
34 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
35 |
-
from peft import PeftModel
|
36 |
|
37 |
-
|
38 |
-
bnb_config = BitsAndBytesConfig(
|
39 |
-
load_in_4bit=True,
|
40 |
-
bnb_4bit_use_double_quant=True,
|
41 |
-
bnb_4bit_quant_type="nf4",
|
42 |
-
bnb_4bit_compute_dtype=torch.bfloat16
|
43 |
-
)
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
quantization_config=bnb_config, # Same quantization config as before
|
48 |
device_map="auto",
|
49 |
trust_remote_code=True,
|
50 |
)
|
51 |
|
52 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
53 |
-
|
54 |
-
ft_model = PeftModel.from_pretrained(base_model, "LeonardPuettmann/LlaMaestra-3.2-1B-Instruct-v0.1-4bit")
|
55 |
|
56 |
row_json = [
|
57 |
{"role": "system", "content": "Your job is to return translations for sentences or words from either Italian to English or English to Italian."},
|
58 |
-
{"role": "user", "content": "
|
59 |
]
|
60 |
|
61 |
prompt = tokenizer.apply_chat_template(row_json, tokenize=False)
|
62 |
model_input = tokenizer(prompt, return_tensors="pt").to("cuda")
|
63 |
|
64 |
with torch.no_grad():
|
65 |
-
print(tokenizer.decode(
|
66 |
```
|
67 |
|
68 |
## Data used
|
|
|
31 |
|
32 |
```python
|
33 |
import torch
|
34 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
|
35 |
|
36 |
+
model_id = "LeonardPuettmann/LlaMaestra-3.2-1B-Instruct-v0.1-4bit"
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
+
model = AutoModelForCausalLM.from_pretrained(
|
39 |
+
model_id,
|
|
|
40 |
device_map="auto",
|
41 |
trust_remote_code=True,
|
42 |
)
|
43 |
|
44 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id, add_bos_token=True, trust_remote_code=True)
|
|
|
|
|
45 |
|
46 |
row_json = [
|
47 |
{"role": "system", "content": "Your job is to return translations for sentences or words from either Italian to English or English to Italian."},
|
48 |
+
{"role": "user", "content": "Do you sell tickets for the bus?"},
|
49 |
]
|
50 |
|
51 |
prompt = tokenizer.apply_chat_template(row_json, tokenize=False)
|
52 |
model_input = tokenizer(prompt, return_tensors="pt").to("cuda")
|
53 |
|
54 |
with torch.no_grad():
|
55 |
+
print(tokenizer.decode(model.generate(**model_input, max_new_tokens=1024)[0]))
|
56 |
```
|
57 |
|
58 |
## Data used
|