Hunzla committed
Commit
8fd601d
•
1 Parent(s): 12683af

Create app.py

Files changed (1)
  1. app.py +246 -0
app.py ADDED
@@ -0,0 +1,246 @@
import os
import torch
import gradio as gr
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
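
# These imports assume transformers, datasets, peft, trl, bitsandbytes,
# accelerate, and gradio are installed in the environment running this app.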
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant is a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 25

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
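# Note: dataset_text_field="text" above assumes each training example is a
# single formatted string in a "text" column, as in mlabonne/guanaco-llama2-1k.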

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)
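# trainer.model is a PEFT model, so save_pretrained writes only the trained
# LoRA adapter weights here; the full merged model is produced further below.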
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our new model
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Push the merged model and the tokenizer to the Hub under the new model name
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)
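# Pushing to the Hub assumes you are already authenticated, e.g. via
# `huggingface-cli login` or a Hugging Face token exposed to the environment.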

# Placeholder callback: simply echoes the prompt back
def do_nothing(text):
    return text

# Create Gradio interface
interface = gr.Interface(
    fn=do_nothing,
    inputs="text",
    outputs="text",
    title="LLAMA-2-7B Chatbot",
    description="Enter a prompt and get a chatbot response.",
    examples=[["Tell me a joke."]],
)

if __name__ == "__main__":
    interface.launch()
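
As written, the Gradio callback only echoes the prompt back. A minimal sketch of wiring the fine-tuned model into the interface instead, assuming the `pipe` text-generation pipeline defined earlier in the script is in scope (the helper name generate_response is illustrative, not part of this commit):

def generate_response(prompt):
    # Wrap the user prompt in the Llama-2 [INST] ... [/INST] chat template
    # and reuse the `pipe` text-generation pipeline created above.
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    return result[0]["generated_text"]

# Then pass fn=generate_response instead of fn=do_nothing to gr.Interface.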