VietnamAIHub committed
Commit 7118e65
1 Parent(s): a5519c6
Files changed (1)
  1. README.md +43 -13
README.md CHANGED
@@ -20,24 +20,42 @@ To load the fine-tuned Vietnamese Llama-30b model with LoRA adapters, follow the
 
 ```python
 import torch
-from transformers import AutoModelForCausalLM, LlamaTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-model_name = "VietnamAIHub/Vietnamese_SFT_llama_30B_v1"
+model_name = "VietnamAIHub/Vietnamese_llama_30B_SFT"
 cache_dir="/save_weight_path"
-## Loading Base LLaMa model weight and Merge with Adapter Weight with the base model
+
+## Loading LLaMa model weight
 m = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype=torch.bfloat16,
-    device_map={"cuda": 0},
+    load_in_8bit=True,
+    trust_remote_code=True,
     cache_dir=cache_dir
+
 )
 
-## Save model to specific path
-tok = LlamaTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+## Loading Tokenizer
+tok = AutoTokenizer.from_pretrained(
+    model_name,
+    padding_side="right",
+    use_fast=False,  # fast tokenizer gives issues
+    tokenizer_type='llama',  # needed for the HF name change
+    use_auth_token=True,
+    cache_dir=cache_dir)
 
-## Loading Unified Model Again after Merging the Weight
 tok.bos_token_id = 1
+stop_token_ids = [0]
+
+## Setting Stopping Criteria
+class StopOnTokens(StoppingCriteria):
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        for stop_id in stop_token_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+stop = StopOnTokens()
+streamer = TextIteratorStreamer(tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
 
 generation_config = dict(
     temperature=0.2,
@@ -46,15 +64,25 @@ generation_config = dict(
     do_sample=True,
     num_beams=1,
     repetition_penalty=1.2,
-    max_new_tokens=400,
+    max_new_tokens=1024,
     early_stopping=True,
-
+    stopping_criteria=StoppingCriteriaList([stop]),
+    streamer=streamer,
 )
 
-prompt="Cách để học tập về một môn học thật tốt"
-_DEFAULT_TEMPLATE=f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### prompt:\n{prompt}\n\n### response:\n"
 
-inputs = tok(message,return_tensors="pt") #add_special_tokens=False ?
+
+
+## Set your Input with System Prompt
+
+input_prompt = "Cách để học tập về một môn học thật tốt"  # "How to study a subject really well"
+system_prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### prompt:\n{input_prompt}\n\n### response:\n"
+
+
+inputs = tok(system_prompt, return_tensors="pt")  # add_special_tokens=False ?
+inputs = inputs.to(device)
+
+
 generation_output = m.generate(
     input_ids=inputs["input_ids"].to(device),
     attention_mask=inputs['attention_mask'].to(device),
@@ -62,11 +90,13 @@ generation_output = m.generate(
     pad_token_id=tok.pad_token_id,
     **generation_config
 )
+
 generation_output_ = m.generate(input_ids=inputs["input_ids"].to(device), **generation_config)
 s = generation_output[0]
 output = tok.decode(s, skip_special_tokens=True)
 response = output.split("### response:")[1].strip()
 print(response)
+
 ```
 
 ## Conclusion
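The updated snippet creates a `TextIteratorStreamer` and passes it into `generate` through `generation_config`, but it never reads from the streamer, so nothing is actually streamed. The sketch below shows the usual way to consume it: run `generate` in a background thread and iterate over the streamer in the main thread. It is a minimal illustration that assumes the `m`, `tok`, `device`, `inputs`, `streamer`, and `generation_config` objects defined in the README code above.

```python
from threading import Thread

# Run generation in a background thread so the main thread can read from
# the TextIteratorStreamer while tokens are still being produced.
generation_kwargs = dict(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    pad_token_id=tok.pad_token_id,
    **generation_config,  # already contains streamer and stopping_criteria
)
thread = Thread(target=m.generate, kwargs=generation_kwargs)
thread.start()

response = ""
for new_text in streamer:  # yields decoded text chunks as they arrive
    print(new_text, end="", flush=True)
    response += new_text
thread.join()
```

Because the streamer was built with `skip_prompt=True`, it yields only the newly generated text, so the `split("### response:")` post-processing is not needed on this path.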