VietnamAIHub committed • Commit 7118e65 • 1 Parent(s): a5519c6 • update

README.md CHANGED

To load the fine-tuned Vietnamese Llama-30b model with LoRA adapters, follow the steps below:

```python
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "VietnamAIHub/Vietnamese_llama_30B_SFT"
cache_dir = "/save_weight_path"  # local directory where the weights will be cached

## Loading LLaMa model weight
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,   # quantize on load so the 30B model fits in less VRAM
    device_map="auto",   # recommended with 8-bit loading so weights are dispatched to the GPU
    trust_remote_code=True,
    cache_dir=cache_dir,
)
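
# Optional sanity check: with 8-bit loading the footprint should be roughly
# one byte per parameter (on the order of 30 GB for a 30B model).
print(f"Model memory footprint: {m.get_memory_footprint() / 1e9:.1f} GB")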

## Loading Tokenizer
tok = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
    use_fast=False,          # the fast tokenizer gives issues for this checkpoint
    tokenizer_type='llama',  # needed after the HF LLaMA naming change
    use_auth_token=True,
    cache_dir=cache_dir,
)
tok.bos_token_id = 1   # make sure BOS matches the LLaMA convention
stop_token_ids = [0]   # token ids that should end generation
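
# Defensive guard (an assumption: many LLaMA tokenizers ship without a pad
# token): fall back to EOS so the pad_token_id passed to generate() is not None.
if tok.pad_token_id is None:
    tok.pad_token_id = tok.eos_token_id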

## Setting Stopping Criteria
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Stop as soon as the most recently generated token is a stop token.
        for stop_id in stop_token_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

stop = StopOnTokens()
streamer = TextIteratorStreamer(tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

generation_config = dict(
    temperature=0.2,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.2,
    max_new_tokens=1024,
    early_stopping=True,
    stopping_criteria=StoppingCriteriaList([stop]),
    streamer=streamer,
)

## Set your Input with System Prompt
input_prompt = "Cách để học tập về một môn học thật tốt"  # "How to study a subject really well"
system_prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### prompt:\n{input_prompt}\n\n### response:\n"

inputs = tok(system_prompt, return_tensors="pt")  # add_special_tokens=False ?

generation_output = m.generate(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    pad_token_id=tok.pad_token_id,
    **generation_config,
)

s = generation_output[0]
output = tok.decode(s, skip_special_tokens=True)
response = output.split("### response:")[1].strip()  # keep only the model's answer
print(response)
```
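
The `streamer` above is wired into `generation_config`, but a `TextIteratorStreamer` only pays off when its queue is consumed from a second thread while `generate` runs; the blocking call above fills the queue without anything reading it. Below is a minimal streaming sketch, assuming `m`, `tok`, `inputs`, `device`, and `generation_config` from the block above are still in scope; a fresh streamer is created because a streamer instance should not be reused across calls.

```python
from threading import Thread

from transformers import TextIteratorStreamer

# Fresh streamer for this call, with a generous timeout since a 30B model can
# take a while to emit its first token.
streamer = TextIteratorStreamer(tok, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
generation_config["streamer"] = streamer

# Run generation in the background so the main thread can consume text live.
thread = Thread(
    target=m.generate,
    kwargs=dict(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        pad_token_id=tok.pad_token_id,
        **generation_config,
    ),
)
thread.start()

# The streamer yields decoded text chunks as soon as they are available.
response = ""
for chunk in streamer:
    print(chunk, end="", flush=True)
    response += chunk
thread.join()
```

The `timeout` bounds how long the consuming loop waits for the next chunk before raising `queue.Empty`, so it should comfortably exceed the model's worst-case gap between tokens.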

## Conclusion