VietnamAIHub committed
Commit 7118e65
1 Parent(s): a5519c6
Files changed (1)
  1. README.md +43 -13
README.md CHANGED
@@ -20,24 +20,42 @@ To load the fine-tuned Vietnamese Llama-30b model with LoRA adapters, follow the
 
 ```python
 import torch
-from transformers import AutoModelForCausalLM, LlamaTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-model_name = "VietnamAIHub/Vietnamese_SFT_llama_30B_v1"
+model_name = "VietnamAIHub/Vietnamese_llama_30B_SFT"
 cache_dir="/save_weight_path"
-## Loading Base LLaMa model weight and Merge with Adapter Weight with the base model
+
+## Loading LLaMa model weight
 m = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype=torch.bfloat16,
-    device_map={"cuda": 0},
+    load_in_8bit=True,
+    trust_remote_code=True,
     cache_dir=cache_dir
+
 )
 
-## Save model to specific path
-tok = LlamaTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+## Loading Tokenizer
+tok = AutoTokenizer.from_pretrained(
+    model_name,
+    padding_side="right",
+    use_fast=False,  # fast tokenizer gives issues
+    tokenizer_type='llama',  # needed for the HF name change
+    use_auth_token=True,
+    cache_dir=cache_dir)
 
-## Loading Unified Model Again after Merging the Weight
 tok.bos_token_id = 1
+stop_token_ids = [0]
+
+## Setting Stopping Criteria
+class StopOnTokens(StoppingCriteria):
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        for stop_id in stop_token_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+stop = StopOnTokens()
+streamer = TextIteratorStreamer(tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
 
 generation_config = dict(
     temperature=0.2,
@@ -46,15 +64,25 @@ generation_config = dict(
     do_sample=True,
     num_beams=1,
     repetition_penalty=1.2,
-    max_new_tokens=400,
+    max_new_tokens=1024,
     early_stopping=True,
-
+    stopping_criteria=StoppingCriteriaList([stop]),
+    streamer=streamer,
 )
 
-prompt="Cách để học tập về một môn học thật tốt"
-_DEFAULT_TEMPLATE=f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### prompt:\n{prompt}\n\n### response:\n"
 
-inputs = tok(message,return_tensors="pt") #add_special_tokens=False ?
+
+
+## Set your Input with System Prompt
+
+input_prompt = "Cách để học tập về một môn học thật tốt"  # "How to study a subject really well"
+system_prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### prompt:\n{input_prompt}\n\n### response:\n"
+
+
+inputs = tok(system_prompt, return_tensors="pt")  # add_special_tokens=False ?
+inputs = inputs.to(device)
+
+
 generation_output = m.generate(
     input_ids=inputs["input_ids"].to(device),
     attention_mask=inputs['attention_mask'].to(device),
@@ -62,11 +90,13 @@ generation_output = m.generate(
     pad_token_id=tok.pad_token_id,
     **generation_config
 )
+
 generation_output_ = m.generate(input_ids=inputs["input_ids"].to(device), **generation_config)
 s = generation_output[0]
 output = tok.decode(s, skip_special_tokens=True)
 response = output.split("### response:")[1].strip()
 print(response)
+
 ```
 
 ## Conclusion
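The updated snippet creates a `TextIteratorStreamer` and passes it into `generate` through `generation_config`, but it never reads from the streamer, so nothing is actually streamed. The sketch below shows the usual way to consume it: run `generate` in a background thread and iterate over the streamer in the main thread. It is a minimal illustration that assumes the `m`, `tok`, `device`, `inputs`, `streamer`, and `generation_config` objects defined in the README code above.

```python
from threading import Thread

# Run generation in a background thread so the main thread can read from
# the TextIteratorStreamer while tokens are still being produced.
generation_kwargs = dict(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    pad_token_id=tok.pad_token_id,
    **generation_config,  # already contains streamer and stopping_criteria
)
thread = Thread(target=m.generate, kwargs=generation_kwargs)
thread.start()

response = ""
for new_text in streamer:  # yields decoded text chunks as they arrive
    print(new_text, end="", flush=True)
    response += new_text
thread.join()
```

Because the streamer was built with `skip_prompt=True`, it yields only the newly generated text, so the `split("### response:")` post-processing is not needed on this path.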