Update README.md
README.md (CHANGED)
@@ -287,17 +287,26 @@ FLAIR เนื้องอกไขมันที่เส้นกลาง
@@ -305,8 +314,17 @@ messages = [

Before:

```python
import transformers
import torch

model_id = "EIRTHAIMED/Llama-3.1-EIRAI-8B"
# ...
    device_map="auto",
)

messages = [
    # ...
    {"role": "user", "content": "การใช้ clinical tracer มีบทบาทอย่างไรในการพัฒนาคุณภาพการดูแลผู้ป่วย?"}
]
# ...
```
After:

```python
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "EIRTHAIMED/Llama-3.1-EIRAI-8B"

# Optional NF4 4-bit quantization config (takes effect only if passed to from_pretrained below)
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    # quantization_config=nf4_config,  # uncomment this line for 4-bit loading
    device_map="auto",
    attn_implementation="flash_attention_2"
)

messages = [
    # ...
    # Thai: "What role does the use of clinical tracers play in improving the quality of patient care?"
    {"role": "user", "content": "การใช้ clinical tracer มีบทบาทอย่างไรในการพัฒนาคุณภาพการดูแลผู้ป่วย?"}
]

input = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # must be set for generation
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

# Stream tokens to stdout as they are generated, skipping the echoed prompt
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input,
    streamer=text_streamer,
    max_new_tokens=1500,
    do_sample=True,
    temperature=0.01,
    top_k=100,
    top_p=0.95,
)
```
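If streaming output is not needed, a non-streaming variant can reuse the `model`, `tokenizer`, and `input` objects defined above. This is a minimal sketch, not part of the commit; the prompt-stripping slice is a common decoding pattern rather than something this README specifies:

```python
# Generate without a streamer, then decode only the newly generated tokens.
output_ids = model.generate(
    input,
    max_new_tokens=1500,
    do_sample=True,
    temperature=0.01,
    top_k=100,
    top_p=0.95,
)
# input has shape (1, prompt_len); slice off the prompt so only the answer is decoded.
answer = tokenizer.decode(output_ids[0][input.shape[-1]:], skip_special_tokens=True)
print(answer)
```

With temperature=0.01 the sampling is nearly greedy, so the streamed and non-streamed outputs should be essentially identical.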