yjf9966 committed
Commit 296d86a
1 Parent(s): c099cdf

Update README.md

Files changed (1)
  1. README.md +61 -28
README.md CHANGED
@@ -57,35 +57,68 @@ Users (both direct and downstream) should be made aware of the risks, biases and
 
  Use the code below to get started with the model.
 
- ```
+ ```python
+ from transformers import LlamaForCausalLM, LlamaTokenizer
  import torch
- import transformers
- from transformers import LlamaTokenizer, LlamaForCausalLM
-
- def generate_prompt(text):
-     return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n" +
-     ### Instruction:\n\n{text}\n\n### Response:\n\n"""
-
- tokenizer = LlamaTokenizer.from_pretrained('BlueWhaleX/bwx-13B-HF')
- model = LlamaForCausalLM.from_pretrained('BlueWhaleX/bwx-13B-HF').half().cuda()
- model.eval()
-
- text = '王国维说:“自周之衰,文王、周公势力之瓦解也,国民之智力成熟于内,政治之纷乱乘之于外,上无统一之制度,下迫于社会之要求,于是诸于九流各创其学说。” 他意在说明 A. 分封制的崩溃 B. 商鞅变法的作用 C. 兼并战争的后果 D. 百家争鸣的原因'
- prompt = generate_prompt(text)
- input_ids = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
-
- with torch.no_grad():
-     output_ids = model.generate(
-         input_ids=input_ids,
-         max_new_tokens=400,
-         temperature=0.2,
-         top_k=40,
-         top_p=0.9,
-         repetition_penalty=1.3
-     ).cuda()
- output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
- response = output.split("### Response:")[1].strip()
- print("Response: ", response, '\n')
+
+ base_model_name = "BlueWhaleX/bwx-13B-hf"
+ load_type = torch.float16
+ device = None
+
+ generation_config = dict(
+     temperature=0.2,
+     top_k=40,
+     top_p=0.9,
+     do_sample=True,
+     num_beams=1,
+     repetition_penalty=1.3,
+     max_new_tokens=400
+ )
+
+ prompt_input = (
+     "Below is an instruction that describes a task. "
+     "Write a response that appropriately completes the request.\n\n"
+     "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
+ )
+ if torch.cuda.is_available():
+     device = torch.device(0)
+ else:
+     device = torch.device('cpu')
+
+ def generate_prompt(instruction, input=None):
+     if input:
+         instruction = instruction + '\n' + input
+     return prompt_input.format_map({'instruction': instruction})
+
+ tokenizer = LlamaTokenizer.from_pretrained(base_model_name)
+ model = LlamaForCausalLM.from_pretrained(
+     base_model_name,
+     load_in_8bit=False,
+     torch_dtype=load_type,
+     low_cpu_mem_usage=True,
+     device_map='auto',
+ )
+
+ model_vocab_size = model.get_input_embeddings().weight.size(0)
+ tokenzier_vocab_size = len(tokenizer)
+ if model_vocab_size != tokenzier_vocab_size:
+     model.resize_token_embeddings(tokenzier_vocab_size)
+
+ raw_input_text = input("Input:")
+ input_text = generate_prompt(instruction=raw_input_text)
+ inputs = tokenizer(input_text, return_tensors="pt")
+ generation_output = model.generate(
+     input_ids=inputs["input_ids"].to(device),
+     attention_mask=inputs['attention_mask'].to(device),
+     eos_token_id=tokenizer.eos_token_id,
+     pad_token_id=tokenizer.pad_token_id,
+     **generation_config
+ )
+ s = generation_output[0]
+ output = tokenizer.decode(s, skip_special_tokens=True)
+ response = output.split("### Response:")[1].strip()
+ print("Response: ", response)
+ print("\n")
  ```
 
 
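
For readers who want to sanity-check the prompt format without loading the 13B weights, here is a minimal sketch that exercises only the prompt template and the "### Response:" parsing from the updated snippet above. The example instruction and the simulated model output are illustrative assumptions, not part of this commit.

```python
# Minimal sketch: prompt construction and response parsing only (no model needed).
# The instruction text and the simulated model output below are placeholders.
prompt_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)

def generate_prompt(instruction, input=None):
    # Optional extra context is appended to the instruction, as in the snippet above.
    if input:
        instruction = instruction + '\n' + input
    return prompt_input.format_map({'instruction': instruction})

prompt = generate_prompt("解释一下百家争鸣的原因。")  # placeholder instruction
# The decoded generation echoes the prompt followed by the answer, which is why
# the README snippet recovers the answer by splitting on the "### Response:" marker.
simulated_output = prompt + "这是模型的示例回答。"  # placeholder model output
response = simulated_output.split("### Response:")[1].strip()
print("Response: ", response)
```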