---
tags:
- text-generation
- meta-llama/Llama-2-7b-chat-hf
inference: false
datasets:
- samsum
library_name: peft
---

Install `transformers`, `peft`, `accelerate`, and `bitsandbytes` (e.g. `pip install transformers peft accelerate bitsandbytes`), then load the 4-bit quantized Llama-2-7b-chat base model and attach the LoRA adapter from this repository:

```python
import torch
import transformers
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import cuda, bfloat16

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization with double quantization and bfloat16 compute
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

hf_auth = "your-hf-access-token"

model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

# Load the quantized base model; device_map='auto' places it on the available device(s)
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# Attach the LoRA adapter fine-tuned for call summarization.
# Note: .to(device) is not used here, since 4-bit models loaded with
# device_map='auto' are already dispatched and cannot be moved with .to().
config = PeftConfig.from_pretrained("Ashishkr/llama2-call-summarization")
model = PeftModel.from_pretrained(model, "Ashishkr/llama2-call-summarization")
model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)
```

Define a helper that tokenizes a prompt, generates under autocast, and returns only the newly generated text, then run it on a sample dialogue:

```python
def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92,
) -> str:
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,  # sample so that `temperature` takes effect
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    # Return only the newly generated text, without the prompt
    return decoded_output[len(prompt):]


prompt = """instruction: "summarize this conversation :" \n
input: "Oli: I've talked to some people from the third year
Jacob: About the statistics exam?
Marcia: What did they say?
Oli: Yeah, about the exam
Oli: We need to prepare for a battle
Jacob: So it will be difficult
Oli: They said it was the hardest exam ever
Marcia: 😱
Oli: The questions were displayed on the screen
Oli: One minute per question and it disappears
Oli: They won't come back so if you didn't get your answer you're fucked
Marcia: So we need to make the calculations really fast
Jacob: That's insane
Oli: I know
Oli: Very stressful
Marcia: How are we even supposed to study for it?
Marcia: With a timer?
Oli: I guess
Marcia: Did anybody pass it last year
Oli: Some people did, but the majority had to take the second or even the third chance"\n
response: """

# llama_generate already strips the prompt; just trim surrounding whitespace
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.9,
).strip()
print(response)
```
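For convenience, a raw dialogue can be wrapped into the same instruction/input/response template used above before calling `llama_generate`. The helper below is a minimal sketch, not part of this repository: `summarize_dialogue` and the sample `dialogue` are hypothetical names, and it simply reuses the `model`, `tokenizer`, and `llama_generate` objects defined earlier.

```python
def summarize_dialogue(dialogue: str, max_new_tokens: int = 100) -> str:
    """Hypothetical convenience wrapper: formats a raw dialogue into the
    instruction/input/response template shown above and returns the summary."""
    prompt = (
        'instruction: "summarize this conversation :" \n'
        f'input: "{dialogue}"\n'
        'response: '
    )
    return llama_generate(
        model,
        tokenizer,
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=0.9,
    ).strip()


# Example usage with a made-up dialogue in the SAMSum style
dialogue = (
    "Anna: are we still on for lunch tomorrow?\n"
    "Ben: yes, 12:30 at the usual place\n"
    "Anna: perfect, see you there"
)
print(summarize_dialogue(dialogue))
```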