qgyd2021 committed on
Commit 3177298
1 Parent(s): 3081f71

[update]add code

.gitignore ADDED
@@ -0,0 +1,6 @@
+
+ .git/
+ .idea/
+
+ **/flagged/
+ **/__pycache__/
examples/exercises/chinese_modern_poetry/1.prepare_data.py ADDED
@@ -0,0 +1,39 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+
+ from datasets import load_dataset
+
+ from project_settings import project_path
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--dataset_path", default="Iess/chinese_modern_poetry", type=str)
+     parser.add_argument("--dataset_name", default=None, type=str)
+     parser.add_argument("--dataset_split", default=None, type=str)
+     parser.add_argument(
+         "--dataset_cache_dir",
+         default=(project_path / "hub_datasets").as_posix(),
+         type=str
+     )
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     dataset = load_dataset(
+         path=args.dataset_path,
+         name=args.dataset_name,
+         split=args.dataset_split,
+         cache_dir=args.dataset_cache_dir
+     )
+     print(dataset)
+
+     return
+
+
+ if __name__ == '__main__':
+     main()
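1.prepare_data.py only downloads the dataset and prints its splits. For a quick look at the two fields that 2.train_model.py later consumes (it pops `prompt` and `response` from each example), a minimal hedged sketch:

```python
#!/usr/bin/python3
# a minimal sketch; assumes the dataset exposes the "prompt" and
# "response" columns that 2.train_model.py consumes.
from datasets import load_dataset

dataset_dict = load_dataset("Iess/chinese_modern_poetry")
for sample in dataset_dict["train"].select(range(3)):
    print(sample["prompt"])
    print(sample["response"])
    print("-" * 32)
```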
examples/exercises/chinese_modern_poetry/2.train_model.py ADDED
@@ -0,0 +1,327 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ import os
+ import platform
+ import sys
+ from typing import Optional
+
+ pwd = os.path.abspath(os.path.dirname(__file__))
+ sys.path.append(os.path.join(pwd, '../../../'))
+
+ import bitsandbytes as bnb
+ from datasets import Dataset, DatasetDict, load_dataset
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ import torch
+ from transformers.data.data_collator import DataCollatorForLanguageModeling
+ from transformers.trainer import Trainer
+ from transformers.training_args import TrainingArguments
+ from transformers.models.auto import AutoModelForCausalLM, AutoTokenizer
+ from transformers.utils.quantization_config import BitsAndBytesConfig
+
+ from project_settings import project_path
+ from toolbox.transformers.data.dataset.dataset import SFTDataset, ChatGLM2SFTDataset
+ from toolbox.transformers.data.data_collator import SFTDataCollator
+ from toolbox.transformers.modules.loss import TargetLMLoss
+ from toolbox.transformers.trainer import LoRATrainer
+
+
+ def get_args():
+     """
+     python3 2.train_model.py --pretrained_model_name_or_path /data/tianxing/PycharmProjects/Transformers/pretrained_models/huggingface/YeungNLP/firefly-chatglm2-6b
+
+     """
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--dataset_path", default="Iess/chinese_modern_poetry", type=str)
+     parser.add_argument("--dataset_name", default=None, type=str)
+     parser.add_argument("--dataset_split", default=None, type=str)
+     parser.add_argument(
+         "--dataset_cache_dir",
+         default=(project_path / "hub_datasets").as_posix(),
+         type=str
+     )
+
+     parser.add_argument(
+         "--pretrained_model_name_or_path",
+         default="Qwen/Qwen-7B",
+         type=str
+     )
+     parser.add_argument("--cache_dir", default="cache_dir", type=str)
+
+     # train
+     parser.add_argument("--output_dir", default="serialization_dir", type=str)
+     parser.add_argument("--overwrite_output_dir", action="store_true")
+     parser.add_argument("--evaluation_strategy", default="no", choices=["no", "steps", "epoch"], type=str)
+     parser.add_argument("--per_device_train_batch_size", default=4, type=int)
+     parser.add_argument("--gradient_accumulation_steps", default=4, type=int)
+     parser.add_argument("--learning_rate", default=2e-4, type=float)
+     parser.add_argument("--weight_decay", default=0, type=float)
+     parser.add_argument("--max_grad_norm", default=0.3, type=float)
+     parser.add_argument("--num_train_epochs", default=1.0, type=float)
+     parser.add_argument("--max_steps", default=-1, type=int)
+     parser.add_argument("--lr_scheduler_type", default="constant_with_warmup", type=str)
+     parser.add_argument("--warmup_ratio", default=0.0, type=float)
+     parser.add_argument("--warmup_steps", default=3000, type=int)
+     parser.add_argument("--logging_steps", default=300, type=int)
+     parser.add_argument("--save_strategy", default="steps", type=str)
+     parser.add_argument("--save_steps", default=500, type=int)
+     parser.add_argument("--save_total_limit", default=2, type=int)
+     parser.add_argument("--no_cuda", action="store_true")
+     parser.add_argument("--seed", default=3407, type=int, help="https://arxiv.org/abs/2109.08203")
+     # parser.add_argument("--fp16", action="store_true")
+     parser.add_argument("--fp16", action="store_false")
+     parser.add_argument("--half_precision_backend", default="auto", type=str)
+     parser.add_argument("--dataloader_num_workers", default=0, type=int)
+     parser.add_argument("--disable_tqdm", action="store_true")
+     # parser.add_argument("--disable_tqdm", action="store_false")
+     parser.add_argument("--remove_unused_columns", action="store_true")
+     # parser.add_argument("--remove_unused_columns", action="store_false")
+     # parser.add_argument("--deepspeed", default="ds_z3_config.json", type=str)
+     parser.add_argument("--deepspeed", default=None, type=str)
+     parser.add_argument("--optim", default="paged_adamw_32bit", type=str)
+     parser.add_argument("--report_to", default="tensorboard", type=str)
+     parser.add_argument("--resume_from_checkpoint", default="file_dir/serialization_dir/checkpoint-103000", type=str)
+     # parser.add_argument("--gradient_checkpointing", action="store_true")
+     parser.add_argument("--gradient_checkpointing", action="store_false")
+
+     # dataset process
+     parser.add_argument("--truncate_longer_samples", action="store_true")
+     parser.add_argument("--max_seq_length", default=1024, type=int)
+
+     # lora
+     parser.add_argument("--lora_rank", default=64, type=int)
+     parser.add_argument("--lora_alpha", default=16, type=int)
+     parser.add_argument("--lora_dropout", default=0.05, type=float)
+
+     args = parser.parse_args()
+     return args
+
+
+ def verify_model_dtype(model):
+     """
+     Inspect how parameters of each dtype are distributed in the model.
+     """
+     dtype2param_num = defaultdict(int)  # parameter count per dtype
+     dtype2param_name = defaultdict(list)  # parameter names per dtype
+     dtype2trainable_param_num = defaultdict(int)  # trainable parameter count per dtype
+     dtype2trainable_param_name = defaultdict(list)  # trainable parameter names per dtype
+     for name, p in model.named_parameters():
+         dtype = p.dtype
+         dtype2param_num[dtype] += p.numel()
+         dtype2param_name[dtype].append(name)
+         if p.requires_grad:
+             dtype2trainable_param_num[dtype] += p.numel()
+             dtype2trainable_param_name[dtype].append(name)
+
+     # distribution of dtypes over all parameters.
+     total = 0
+     print('verify all params of the model')
+     for k, v in dtype2param_num.items():
+         total += v
+     for k, v in dtype2param_num.items():
+         print(k, v, v / total)
+     for k, v in dtype2param_name.items():
+         print(k, v)
+
+     print()
+     # distribution of dtypes over trainable parameters.
+     print('verify trainable params of the model')
+     total_trainable = 0
+     for k, v in dtype2trainable_param_num.items():
+         total_trainable += v
+     for k, v in dtype2trainable_param_num.items():
+         print(k, v, v / total_trainable)
+     for k, v in dtype2trainable_param_name.items():
+         print(k, v)
+
+
+ def find_all_linear_names(model):
+     """
+     Find all fully-connected (linear) layers, so that an adapter can be added to each of them.
+     """
+     cls = bnb.nn.Linear4bit
+     lora_module_names = set()
+     for name, module in model.named_modules():
+         if isinstance(module, cls):
+             names = name.split('.')
+             lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+     if 'lm_head' in lora_module_names:  # needed for 16-bit
+         lora_module_names.remove('lm_head')
+     return list(lora_module_names)
+
+
+ def main():
+     args = get_args()
+
+     os.makedirs(args.output_dir, exist_ok=True)
+     os.makedirs(args.cache_dir, exist_ok=True)
+
+     # dataset
+     dataset_dict = load_dataset(
+         path=args.dataset_path,
+         name=args.dataset_name,
+         split=args.dataset_split,
+         cache_dir=args.dataset_cache_dir
+     )
+     train_dataset = dataset_dict["train"]
+     print(train_dataset)
+
+     # training_args
+     training_args = TrainingArguments(
+         output_dir=args.output_dir,
+         overwrite_output_dir=args.overwrite_output_dir,
+         evaluation_strategy=args.evaluation_strategy,
+         per_device_train_batch_size=args.per_device_train_batch_size,
+         gradient_accumulation_steps=args.gradient_accumulation_steps,
+         learning_rate=args.learning_rate,
+         weight_decay=args.weight_decay,
+         max_grad_norm=args.max_grad_norm,
+         num_train_epochs=args.num_train_epochs,
+         max_steps=args.max_steps,
+         lr_scheduler_type=args.lr_scheduler_type,
+         warmup_steps=args.warmup_steps,
+         logging_steps=args.logging_steps,
+         save_strategy=args.save_strategy,
+         save_steps=args.save_steps,
+         save_total_limit=args.save_total_limit,
+         no_cuda=args.no_cuda,
+         fp16=args.fp16,
+         half_precision_backend=args.half_precision_backend,
+         dataloader_num_workers=args.dataloader_num_workers,
+         disable_tqdm=args.disable_tqdm,
+         remove_unused_columns=args.remove_unused_columns,
+         # deepspeed=args.deepspeed,
+         optim=args.optim,
+         report_to=args.report_to,
+         resume_from_checkpoint=args.resume_from_checkpoint,
+         gradient_checkpointing=args.gradient_checkpointing,
+     )
+
+     # pretrained model
+     model = AutoModelForCausalLM.from_pretrained(
+         args.pretrained_model_name_or_path,
+         device_map={"": 0},
+         load_in_4bit=True,
+         torch_dtype=torch.float16,
+         trust_remote_code=True,
+         quantization_config=BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type="nf4",
+             llm_int8_threshold=6.0,
+             llm_int8_has_fp16_weight=False,
+         ),
+     )
+     # tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.pretrained_model_name_or_path,
+         trust_remote_code=True,
+         use_fast=False if model.config.model_type == "llama" else True
+     )
+     # QWenTokenizer is special: pad_token_id, bos_token_id and eos_token_id are all None; eod_id corresponds to the <|endoftext|> token
+     if tokenizer.__class__.__name__ == "QWenTokenizer":
+         tokenizer.pad_token_id = tokenizer.eod_id
+         tokenizer.bos_token_id = tokenizer.eod_id
+         tokenizer.eos_token_id = tokenizer.eod_id
+
+     # model
+     # casts all the non int8 modules to full precision (fp32) for stability
+     model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing)
+     print(f"memory footprint of model: {model.get_memory_footprint() / (1024*1024*1024)} GB")
+
+     # find all linear layers that need an adapter inserted
+     target_modules = find_all_linear_names(model)
+     config = LoraConfig(
+         r=args.lora_rank,
+         lora_alpha=args.lora_alpha,
+         target_modules=target_modules,
+         lora_dropout=args.lora_dropout,
+         bias="none",
+         task_type="CAUSAL_LM",
+     )
+     model = get_peft_model(model, config)
+     model.print_trainable_parameters()
+     model.config.torch_dtype = torch.float32
+
+     # inspect how parameters of each dtype are distributed in the model
+     verify_model_dtype(model)
+
+     # initialize the loss function
+     loss_func = TargetLMLoss(ignore_index=-100)
+
+     data_collator = SFTDataCollator(tokenizer, args.max_seq_length)
+
+     # dataset
+     def encode_with_truncation(examples):
+         prompt_ = examples.pop('prompt')
+         response_ = examples.pop('response')
+         utterances = [
+             prompt_,
+             response_
+         ]
+
+         utterances_ids = tokenizer(utterances, add_special_tokens=False).input_ids
+
+         input_ids = [tokenizer.bos_token_id]
+         target_mask = [0]
+         for i, utterances_id in enumerate(utterances_ids):
+             input_ids += (utterances_id + [tokenizer.eos_token_id])
+
+             if i % 2 == 0:
+                 target_mask += [0] * (len(utterances_id) + 1)
+             else:
+                 target_mask += [1] * (len(utterances_id) + 1)
+
+         assert len(input_ids) == len(target_mask)
+
+         input_ids = input_ids[:args.max_seq_length]
+         target_mask = target_mask[:args.max_seq_length]
+         attention_mask = [1] * len(input_ids)
+
+         assert len(input_ids) == len(target_mask) == len(attention_mask)
+
+         inputs = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "target_mask": target_mask
+         }
+         return inputs
+
+     train_dataset = train_dataset.map(
+         encode_with_truncation,
+         batched=False,
+         keep_in_memory=False,
+         num_proc=None if platform.system() == "Windows" else os.cpu_count(),
+         cache_file_name=os.path.join(args.cache_dir, "train.cache")
+     )
+     train_dataset.set_format(type=None, columns=["input_ids", "attention_mask", "target_mask"])
+     print("Train Dataset Examples Batch Number: {}".format(len(train_dataset)))
+
+     # initialize the Trainer
+     trainer = LoRATrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         # tokenizer=tokenizer,
+         data_collator=data_collator,
+         compute_loss=loss_func
+     )
+     train_result = trainer.train()
+
+     # save the final checkpoint
+     final_save_path = os.path.join(training_args.output_dir, "final")
+     trainer.save_model(final_save_path)  # Saves the tokenizer too
+     # save the training metrics
+     metrics = train_result.metrics
+     trainer.log_metrics("train", metrics)
+     trainer.save_metrics("train", metrics)
+     trainer.save_state()
+     return
+
+
+ if __name__ == '__main__':
+     main()
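`SFTDataCollator`, `TargetLMLoss` and `LoRATrainer` come from the repo's own `toolbox` package and are not part of this commit. The key idea is that `target_mask` marks which positions contribute to the loss: prompt tokens are masked out, response tokens are kept. A minimal sketch of how such a loss is usually computed (hypothetical helper, not the toolbox implementation):

```python
import torch
import torch.nn.functional as F


def target_lm_loss(logits, input_ids, target_mask, ignore_index=-100):
    # positions with target_mask == 0 (the prompt) are excluded from the loss
    labels = input_ids.masked_fill(target_mask == 0, ignore_index)
    # causal LM shift: the logits at position t predict token t + 1
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()
    return F.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        ignore_index=ignore_index,
    )
```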
examples/exercises/chinese_modern_poetry/3.merge_lora.py ADDED
@@ -0,0 +1,70 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ import torch
+ """
+ Use this script to merge the LoRA weights into the base model.
+ """
+
+
+ def get_args():
+     """
+     python3 3.merge_lora.py \
+     --pretrained_model_name_or_path /data/tianxing/PycharmProjects/Transformers/pretrained_models/huggingface/Qwen/Qwen-7B \
+     --adapter_name_or_path /data/tianxing/PycharmProjects/Transformers/examples/exercises/chinese_modern_poetry/file_dir/serialization_dir/checkpoint-27000 \
+     --save_directory /data/tianxing/PycharmProjects/Transformers/trained_models/qwen_7b_modern_poetry
+
+     """
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "--pretrained_model_name_or_path",
+         default="YeungNLP/firefly-chatglm2-6b",
+         type=str
+     )
+     parser.add_argument(
+         "--adapter_name_or_path",
+         default="YeungNLP/firefly-baichuan-7b-qlora-sft",
+         type=str
+     )
+     parser.add_argument("--save_directory", default="save_directory", type=str)
+
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     config = AutoConfig.from_pretrained(
+         args.pretrained_model_name_or_path,
+         trust_remote_code=True,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.pretrained_model_name_or_path,
+         trust_remote_code=True,
+         # llama does not support the fast tokenizer
+         use_fast=False if config.model_type == 'llama' else True
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         args.pretrained_model_name_or_path,
+         trust_remote_code=True,
+         low_cpu_mem_usage=True,
+         torch_dtype=torch.float16,
+         # device_map='auto',
+         device_map={"": "cpu"}
+     )
+     model = PeftModel.from_pretrained(model, args.adapter_name_or_path, device_map={"": "cpu"})
+     model = model.merge_and_unload()
+
+     tokenizer.save_pretrained(args.save_directory)
+     model.save_pretrained(args.save_directory)
+     return
+
+
+ if __name__ == '__main__':
+     main()
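After `merge_and_unload()`, the adapter weights are folded into the base weights, so the saved model loads with plain `transformers` and no `peft` dependency. A quick hedged sanity check (the path is a placeholder for the `--save_directory` used above):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# "save_directory" is a placeholder path
model = AutoModelForCausalLM.from_pretrained("save_directory", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("save_directory", trust_remote_code=True)
print(model.num_parameters())
```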
examples/exercises/chinese_modern_poetry/4.test_model.py ADDED
@@ -0,0 +1,104 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+ import os
+ import sys
+
+ pwd = os.path.abspath(os.path.dirname(__file__))
+ sys.path.append(os.path.join(pwd, '../../../'))
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+
+ from project_settings import project_path
+ """
+ Single-turn dialogue; the model has no memory of the conversation history.
+ """
+
+
+ def get_args():
+     """
+     python3 4.test_model.py --pretrained_model_name_or_path /data/tianxing/PycharmProjects/Transformers/trained_models/qwen_7b_chinese_modern_poetry
+     python3 4.test_model.py --pretrained_model_name_or_path /data/tianxing/PycharmProjects/Transformers/trained_models/qwen_7b_modern_poetry
+
+     python3 4.test_model.py --pretrained_model_name_or_path /data/tianxing/PycharmProjects/Transformers/pretrained_models/huggingface/Qwen/Qwen-7B
+
+
+     """
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--pretrained_model_name_or_path",
+         # default="YeungNLP/firefly-chatglm2-6b",
+         default=(project_path / "trained_models/firefly_chatglm2_6b_intent").as_posix(),
+         type=str
+     )
+     parser.add_argument("--max_new_tokens", default=512, type=int)
+     parser.add_argument("--top_p", default=0.9, type=float)
+     parser.add_argument("--temperature", default=0.35, type=float)
+     parser.add_argument("--repetition_penalty", default=1.0, type=float)
+     parser.add_argument('--device', default="cuda" if torch.cuda.is_available() else "cpu", type=str)
+
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     model = AutoModelForCausalLM.from_pretrained(
+         args.pretrained_model_name_or_path,
+         trust_remote_code=True,
+         # low_cpu_mem_usage=True,
+         torch_dtype=torch.float16,
+         # device_map="auto",
+         device_map={"": 0},
+         # offload_folder="./offload",
+     ).to(args.device).eval()
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.pretrained_model_name_or_path,
+         trust_remote_code=True,
+         # llama does not support the fast tokenizer
+         use_fast=False if model.config.model_type == "llama" else True,
+         padding_side="left"
+     )
+
+     # QWenTokenizer is special: pad_token_id, bos_token_id and eos_token_id are all None; eod_id corresponds to the <|endoftext|> token
+     if tokenizer.__class__.__name__ == "QWenTokenizer":
+         tokenizer.pad_token_id = tokenizer.eod_id
+         tokenizer.bos_token_id = tokenizer.eod_id
+         tokenizer.eos_token_id = tokenizer.eod_id
+
+     text = input("User: ")
+     while True:
+         text = text.strip()
+         # chatglm uses the official data format
+         if model.config.model_type == "chatglm":
+             text = "[Round 1]\n\n问:{}\n\n答:".format(text)
+             input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids.to(args.device)
+         # for qwen-7b compatibility: its tokenizer treats eos_token as plain text, so the eos_token_id cannot be obtained by tokenizing it
+         else:
+             input_ids = tokenizer(
+                 text,
+                 return_tensors="pt",
+                 add_special_tokens=False,
+             ).input_ids.to(args.device)
+             bos_token_id = torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long).to(args.device)
+             eos_token_id = torch.tensor([[tokenizer.eos_token_id]], dtype=torch.long).to(args.device)
+             input_ids = torch.concat([bos_token_id, input_ids, eos_token_id], dim=1)
+         with torch.no_grad():
+             outputs = model.generate(
+                 input_ids=input_ids, max_new_tokens=args.max_new_tokens, do_sample=True,
+                 top_p=args.top_p, temperature=args.temperature, repetition_penalty=args.repetition_penalty,
+                 eos_token_id=tokenizer.eos_token_id
+             )
+         outputs = outputs.tolist()[0][len(input_ids[0]):]
+         response = tokenizer.decode(outputs)
+         response = response.strip().replace(tokenizer.eos_token, "").strip()
+         print("LLM: {}".format(response))
+         text = input('User: ')
+
+
+ if __name__ == '__main__':
+     main()
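The manual bos/eos wrapping in the non-chatglm branch mirrors the sample layout built by `encode_with_truncation` in 2.train_model.py, where each training sample is `[bos] prompt [eos] response [eos]` and only the response positions are unmasked. A schematic, runnable illustration (token ids are made up):

```python
prompt_ids = [101, 102]          # schematic prompt token ids
response_ids = [201, 202, 203]   # schematic response token ids
bos, eos = 1, 2                  # schematic special token ids

# layout produced by encode_with_truncation during training:
input_ids = [bos] + prompt_ids + [eos] + response_ids + [eos]
target_mask = [0] + [0] * (len(prompt_ids) + 1) + [1] * (len(response_ids) + 1)
assert len(input_ids) == len(target_mask)

# at inference time 4.test_model.py therefore feeds [bos] prompt [eos]
# and lets the model generate the response tokens up to eos.
```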
examples/exercises/chinese_modern_poetry/run.sh ADDED
@@ -0,0 +1,196 @@
+ #!/usr/bin/env bash
+
+ # sh run.sh --stage 0 --stop_stage 0 --system_version centos
+ # sh run.sh --stage 1 --stop_stage 1 --system_version centos
+ # sh run.sh --stage 2 --stop_stage 2 --system_version centos
+ # sh run.sh --stage 4 --stop_stage 4 --system_version centos --final_model_name qwen_7b_modern_poetry
+
+ # bitsandbytes
+ export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
+ # params
+ system_version="windows";
+ verbose=true;
+ stage=0  # start from 0 if you need to start from data preparation
+ stop_stage=5
+
+ pretrained_model_supplier=Qwen
+ pretrained_model_name=Qwen-7B
+
+ final_checkpoint_dir=final
+ final_model_name=qwen_7b_modern_poetry
+
+ patience=0
+
+
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="`eval echo \\$$name`";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+
+       # Set the variable to the right value-- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+
+     *) break;
+   esac
+ done
+
+
+ $verbose && echo "system_version: ${system_version}"
+
+ work_dir="$(pwd)"
+ file_dir="${work_dir}/file_dir"
+ cache_dir="${file_dir}/cache_dir"
+ serialization_dir="${file_dir}/serialization_dir"
+
+ pretrained_models_dir="${work_dir}/../../../pretrained_models/huggingface/${pretrained_model_supplier}"
+ final_model_dir="${work_dir}/../../../trained_models/${final_model_name}";
+
+ mkdir -p "${file_dir}"
+ mkdir -p "${cache_dir}"
+ mkdir -p "${serialization_dir}"
+ mkdir -p "${pretrained_models_dir}"
+ mkdir -p "${final_model_dir}"
+
+ export PYTHONPATH="${work_dir}/../../.."
+
+
+ if [ $system_version == "windows" ]; then
+   alias python3='C:/Users/tianx/PycharmProjects/virtualenv/Transformers/Scripts/python.exe'
+ elif [ $system_version == "centos" ]; then
+   # conda activate Transformers
+   alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
+ elif [ $system_version == "ubuntu" ]; then
+   alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
+ elif [ $system_version == "macos" ]; then
+   alias python3='/Users/honey/PycharmProjects/virtualenv/TrainLLM/bin/python'
+ fi
+
+
+ function search_best_ckpt() {
+   patience="$1";
+
+   cd "${serialization_dir}" || exit 1
+   last_epoch=$(ls . | \
+                grep "checkpoint-*" | \
+                awk -F'[-]' '{print$2}' | \
+                sort -n | \
+                awk 'END {print}')
+
+   target_dir=
+   if [ -n "${last_epoch}" ]; then
+     target_epoch=$((last_epoch - patience))
+
+     for epoch_idx in $(ls . | grep "checkpoint-*" | awk -F'[-]' '{print$2}' | sort -nr)
+     do
+       if [ "${epoch_idx}" -le "${target_epoch}" ]; then
+         target_dir="checkpoint-${epoch_idx}";
+         break;
+       fi
+     done
+   fi
+
+   echo "${target_dir}"
+ }
+
+
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+   $verbose && echo "stage 0: download pretrained model"
+   cd "${pretrained_models_dir}" || exit 1;
+
+   if [ ! -d "${pretrained_model_name}" ]; then
+     git clone "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/"
+
+     rm -rf "${pretrained_model_name}/.git"
+     rm -rf "${pretrained_model_name}/.gitattributes"
+     rm -rf "${pretrained_model_name}/flax_model.msgpack"
+     rm -rf "${pretrained_model_name}/model.safetensors"
+   fi
+
+   cd "${pretrained_models_dir}/${pretrained_model_name}" || exit 1;
+
+   # pytorch_model.bin; a 135-byte file is a git-lfs pointer, not the real weights
+   if [ -e "pytorch_model.bin" ]; then
+     data_size=$(ls -l pytorch_model.bin | awk '{print $5}')
+     if [ "${data_size}" == "135" ]; then
+       rm -rf pytorch_model.bin;
+     fi
+   fi
+   if [ ! -e "pytorch_model.bin" ]; then
+     wget -c "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/resolve/main/pytorch_model.bin"
+   fi
+
+   # tokenizer.json
+   if [ -e "tokenizer.json" ]; then
+     data_size=$(ls -l tokenizer.json | awk '{print $5}')
+     if [ "${data_size}" == "135" ]; then
+       rm -rf tokenizer.json;
+     fi
+   fi
+   if [ ! -e "tokenizer.json" ]; then
+     wget -c "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/resolve/main/tokenizer.json"
+   fi
+
+ fi
+
+
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+   $verbose && echo "stage 1: prepare data"
+   cd "${work_dir}" || exit 1;
+
+   python3 1.prepare_data.py
+
+ fi
+
+
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+   $verbose && echo "stage 2: train model"
+   cd "${work_dir}" || exit 1;
+
+   python3 2.train_model.py \
+     --pretrained_model_name_or_path "${pretrained_models_dir}/${pretrained_model_name}" \
+     --cache_dir "${cache_dir}" \
+     --output_dir "${serialization_dir}"
+
+ fi
+
+
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+   $verbose && echo "stage 3: merge lora"
+   cd "${work_dir}" || exit 1;
+
+   python3 3.merge_lora.py \
+     --pretrained_model_name_or_path "${pretrained_models_dir}/${pretrained_model_name}" \
+     --adapter_name_or_path "${serialization_dir}/${final_checkpoint_dir}" \
+     --save_directory "${final_model_dir}"
+
+ fi
+
+
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+   $verbose && echo "stage 4: collect files"
+   cd "${work_dir}" || exit 1;
+
+   cp "${pretrained_models_dir}/${pretrained_model_name}/configuration_qwen.py" "${final_model_dir}/configuration_qwen.py"
+   cp "${pretrained_models_dir}/${pretrained_model_name}/modeling_qwen.py" "${final_model_dir}/modeling_qwen.py"
+   cp "${pretrained_models_dir}/${pretrained_model_name}/qwen_generation_utils.py" "${final_model_dir}/qwen_generation_utils.py"
+   cp "${pretrained_models_dir}/${pretrained_model_name}/tokenization_qwen.py" "${final_model_dir}/tokenization_qwen.py"
+
+ fi
examples/exercises/chinese_modern_poetry/stop.sh ADDED
@@ -0,0 +1 @@
+ #!/usr/bin/env bash
main.py ADDED
@@ -0,0 +1,211 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+ import os
+
+ import gradio as gr
+ from transformers import AutoModel, AutoTokenizer
+ from transformers.models.auto import AutoModelForCausalLM, AutoTokenizer
+ # from transformers.utils.quantization_config import BitsAndBytesConfig
+ import torch
+
+ from project_settings import project_path
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--train_subset", default="train.jsonl", type=str)
+     parser.add_argument("--valid_subset", default="valid.jsonl", type=str)
+     parser.add_argument(
+         "--pretrained_model_name_or_path",
+         # default="YeungNLP/firefly-chatglm2-6b",
+         default=(project_path / "trained_models/firefly_chatglm2_6b_intent").as_posix(),
+         type=str
+     )
+     parser.add_argument("--output_file", default="result.xlsx", type=str)
+
+     parser.add_argument("--max_new_tokens", default=512, type=int)
+     parser.add_argument("--top_p", default=0.9, type=float)
+     parser.add_argument("--temperature", default=0.35, type=float)
+     parser.add_argument("--repetition_penalty", default=1.0, type=float)
+     parser.add_argument('--device', default="cuda" if torch.cuda.is_available() else "cpu", type=str)
+
+     args = parser.parse_args()
+     return args
+
+
+ description = """
+ ## ChatGLM-6B
+
+ Based on the [firefly-chatglm2-6b](https://huggingface.co/YeungNLP/firefly-chatglm2-6b) model, trained on the prompt data of [telemarketing_intent](https://huggingface.co/datasets/qgyd2021/telemarketing_intent/tree/main/data/prompt), with the goal of 1-shot intent recognition for the `telemarketing` scenario.
+
+ The classification task has more than one hundred classes, but only some 30k labeled samples, half of which are "无关领域" (out of domain). The approach is:
+ 1. First run a traditional hard classifier, then extract the top-10 labels by probability.
+ 2. Use the top-10 labels as candidates and provide one example sentence for each label.
+ 3. Ask the LLM to output the class of the target sentence.
+
+ The Gradio deployment code references: https://huggingface.co/spaces/aodianyun/ChatGLM-6B
+
+ """
+
+
+ examples = [
+     """我们在做电话营销场景的意图识别任务, 可选的意图如下:
+ 否定(不是); 礼貌用语; 否定答复; 肯定(需要); 用户正忙; 否定(不需要); 无关领域; 否定(没有); 否定(不用了); 价格太高
+
+ 如果你认为给定的句子不属于这些意图中的任务一个, 你可以回答: 不知道.
+
+ Tips:
+ 1. 如果候选意图中有 "无关领域", 当你不知道时, 则它有可能属于无关领域.
+
+
+ Examples:
+
+ ---------
+
+ ExampleSentence: 其实不是
+ ExampleIntent: 否定(不是)
+
+ ExampleSentence: 嗯!嘿嘿!早点休息,晚安咯
+ ExampleIntent: 礼貌用语
+
+ ExampleSentence: 没问诶
+ ExampleIntent: 否定答复
+
+ ExampleSentence: 不好意思都需要谢谢
+ ExampleIntent: 肯定(需要)
+
+ ExampleSentence: 对呀我在忙
+ ExampleIntent: 用户正忙
+
+ ExampleSentence: 。嗯也也不需要吧唉呀现在不需要那个啊嗯
+ ExampleIntent: 否定(不需要)
+
+ ExampleSentence: 我的处理器需要很少的电源。
+ ExampleIntent: 无关领域
+
+ ExampleSentence: 。呃我好像没有在太平洋买过保险,吧拜拜
+ ExampleIntent: 否定(没有)
+
+ ExampleSentence: 嗯不用谢谢
+ ExampleIntent: 否定(不用了)
+
+ ExampleSentence: 费用贵。
+ ExampleIntent: 价格太高
+
+ ---------
+
+ Sentence: 。嗯各位不需要,啊谢谢
+ Intent:""",
+     """我们在做电话营销场景的意图识别任务, 可选的意图如下:
+ 语音信箱; 无关领域; 查物品信息; 污言秽语; 疑问(时间); 疑问(数值); 答时间; 查收费方式; 价格太高; 答数值
+
+ 如果你认为给定的句子不属于这些意图中的任务一个, 你可以回答: 不知道.
+
+ Tips:
+ 1. 如果候选意图中有 "无关领域", 当你不知道时, 则它有可能属于无关领域.
+
+
+ Examples:
+ ---------
+
+ ExampleSentence: 我们留言。
+ ExampleIntent: 语音信箱
+
+ ExampleSentence: 很刚刚打
+ ExampleIntent: 无关领域
+
+ ExampleSentence: 什么东西我听
+ ExampleIntent: 查物品信息
+
+ ExampleSentence: 知道!AV女优!日本人的骄傲!
+ ExampleIntent: 污言秽语
+
+ ExampleSentence: 最后期限
+ ExampleIntent: 疑问(时间)
+
+ ExampleSentence: 一共借了多少钱
+ ExampleIntent: 疑问(数值)
+
+ ExampleSentence: 22号
+ ExampleIntent: 答时间
+
+ ExampleSentence: 运费
+ ExampleIntent: 查收费方式
+
+ ExampleSentence: 利息高
+ ExampleIntent: 价格太高
+
+ ExampleSentence: 20。
+ ExampleIntent: 答数值
+
+ ---------
+
+ Sentence: 。对啊什么东西啊我6月份出来的
+ Intent:"""
+ ]
+
+
+ def main():
+     args = get_args()
+
+     use_cpu = os.environ.get("USE_CPU", "all")  # any non-empty value (including the default) selects the CPU branch below
+
+     tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path, trust_remote_code=True)
+     # QWenTokenizer is special: pad_token_id, bos_token_id and eos_token_id are all None; eod_id corresponds to the <|endoftext|> token
+     if tokenizer.__class__.__name__ == "QWenTokenizer":
+         tokenizer.pad_token_id = tokenizer.eod_id
+         tokenizer.bos_token_id = tokenizer.eod_id
+         tokenizer.eos_token_id = tokenizer.eod_id
+
+     if not use_cpu:
+         model = AutoModel.from_pretrained(
+             args.pretrained_model_name_or_path,
+             trust_remote_code=True
+         ).half().cuda()
+     else:
+         model = AutoModelForCausalLM.from_pretrained(
+             args.pretrained_model_name_or_path,
+             trust_remote_code=True,
+             low_cpu_mem_usage=True,
+             torch_dtype=torch.bfloat16,
+             device_map="auto",
+             offload_folder="./offload",
+             offload_state_dict=True,
+             # load_in_4bit=True,
+         )
+     model = model.eval()
+
+     def fn(inputs, history=None):
+         if history is None:
+             history = list()
+
+         with torch.no_grad():
+             response, history = model.chat(tokenizer, inputs, history)
+
+         return history, history
+
+     with gr.Blocks() as blocks:
+         gr.Markdown(value=description)
+
+         state = gr.State([])
+
+         chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)
+         with gr.Row():
+             with gr.Column(scale=4):
+                 text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
+             with gr.Column(scale=1):
+                 button = gr.Button("Generate")
+
+         gr.Examples(examples, text)
+
+         text.submit(fn, [text, state], [chatbot, state])
+         button.click(fn, [text, state], [chatbot, state])
+
+     blocks.queue().launch()
+
+     return
+
+
+ if __name__ == '__main__':
+     main()
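The description above outlines the pipeline: a conventional classifier proposes the top-10 candidate intents, one example sentence is attached per candidate, and the LLM picks the final intent. A hedged sketch of how such a prompt could be assembled (the helper and its inputs are hypothetical; the classifier and example-retrieval code are not part of this commit):

```python
def build_intent_prompt(candidates, example_per_intent, sentence):
    """candidates: top-10 intent names; example_per_intent: {intent: example sentence}."""
    lines = [
        "我们在做电话营销场景的意图识别任务, 可选的意图如下:",
        "; ".join(candidates),
        "",
        "如果你认为给定的句子不属于这些意图中的任何一个, 你可以回答: 不知道.",
        "",
        "Examples:",
        "---------",
        "",
    ]
    for intent in candidates:
        lines.append("ExampleSentence: {}".format(example_per_intent[intent]))
        lines.append("ExampleIntent: {}".format(intent))
        lines.append("")
    lines.append("---------")
    lines.append("")
    lines.append("Sentence: {}".format(sentence))
    lines.append("Intent:")
    return "\n".join(lines)
```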
project_settings.py ADDED
@@ -0,0 +1,20 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import os
+ from pathlib import Path
+
+ from toolbox.os.environment import EnvironmentManager
+
+
+ project_path = os.path.abspath(os.path.dirname(__file__))
+ project_path = Path(project_path)
+
+
+ environment = EnvironmentManager(
+     path=os.path.join(project_path, 'dotenv'),
+     env=os.environ.get('environment', 'dev'),
+ )
+
+
+ if __name__ == '__main__':
+     pass
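`EnvironmentManager` lives in the repo's `toolbox` package, which this commit does not include. A minimal sketch of a dotenv-backed manager along these lines (purely illustrative; the real class may differ), using the `python-dotenv` package pinned in requirements.txt:

```python
import os

from dotenv import load_dotenv


class EnvironmentManager(object):
    """Load `<path>/<env>.env` into os.environ; illustrative only."""

    def __init__(self, path, env):
        self.path = path
        self.env = env
        load_dotenv(os.path.join(path, "{}.env".format(env)))

    def get(self, key, default=None, dtype=str):
        value = os.environ.get(key)
        return default if value is None else dtype(value)
```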
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ gradio==3.20.1
+ pydantic==1.10.12
+ thinc==7.4.6
+ spacy==2.3.9
+ accelerate==0.21.0
+ transformers==4.30.2
+ peft==0.4.0
+ bitsandbytes==0.39.0
+ numpy==1.21.4
+ pandas==1.2.5
+ tqdm==4.62.3
+ torch==1.13.0
+ datasets
+ python-dotenv==1.0.0
+ sentencepiece==0.1.99
+ scipy==1.10.1
script/install_bitsandbytes.sh ADDED
@@ -0,0 +1,232 @@
+ #!/usr/bin/env bash
+
+ #bitsandbytes
+ #https://github.com/TimDettmers/bitsandbytes
+ #
+ ### Installing bitsandbytes
+ #
+ #bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()) and quantization functions.
+ #
+ #### Installation
+ #
+ #Install via `pip3 install bitsandbytes`.
+ #
+ #After installing, run `python -m bitsandbytes` to verify that the installation succeeded.
+ #
+ #In some cases you may need to compile from source.
+ #
+ #```text
+ #git clone https://github.com/timdettmers/bitsandbytes.git
+ #cd bitsandbytes
+ #
+ ## CUDA_VERSIONS in {110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 120}
+ ## make argument in {cuda110, cuda11x, cuda12x}
+ ## if you do not know what CUDA you have, try looking at the output of: python -m bitsandbytes
+ #CUDA_VERSION=117 make cuda11x
+ #python setup.py install
+ #```
+ #
+ #### Notes
+ #
+ ##### A CUDA matching the GPU version must be installed
+ #
+ #My situation was as follows:
+ #
+ #**GPU and CUDA version.**
+ #
+ #```text
+ ## nvidia-smi
+ #Mon Aug 28 14:38:32 2023
+ #+-----------------------------------------------------------------------------+
+ #| NVIDIA-SMI 515.105.01   Driver Version: 515.105.01   CUDA Version: 11.7     |
+ #|-------------------------------+----------------------+----------------------+
+ #| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+ #| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+ #|                               |                      |               MIG M. |
+ #|===============================+======================+======================|
+ #|   0  Tesla V100S-PCI...  Off  | 00000000:0B:00.0 Off |                    0 |
+ #| N/A   48C    P0    40W / 250W |  14154MiB / 32768MiB |      0%      Default |
+ #|                               |                      |                  N/A |
+ #+-------------------------------+----------------------+----------------------+
+ #
+ #+-----------------------------------------------------------------------------+
+ #| Processes:                                                                  |
+ #|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
+ #|        ID   ID                                                   Usage      |
+ #|=============================================================================|
+ #|    0   N/A  N/A     11127      C   python3                          12973MiB |
+ #|    0   N/A  N/A     25921      C   python3                           1177MiB |
+ #+-----------------------------------------------------------------------------+
+ #```
+ #
+ #**CUDA version.**
+ #
+ #In my experience, once the nvidia driver is installed PyTorch can already use the GPU, and `nvidia-smi` also shows `CUDA Version: 11.7`.
+ #
+ #However, `/usr/local/cuda:/usr/local/cuda-11.7` did not exist. That has to be installed separately (i.e. install CUDA itself).
+ #
+ #```text
+ ## ll /usr/local/ | grep cuda
+ #lrwxrwxrwx   1 root root  20 Aug 15 19:12 cuda -> /usr/local/cuda-11.7
+ #drwxr-xr-x  14 root root 268 Aug 15 18:31 cuda-11.7
+ #```
+ #
+ ##### Compiling from source
+ #
+ #The commands used when compiling from source:
+ #
+ #```
+ #CUDA SETUP: Something unexpected happened. Please compile from source:
+ #git clone git@github.com:TimDettmers/bitsandbytes.git
+ #cd bitsandbytes
+ #CUDA_VERSION=117 make cuda11x_nomatmul
+ #python setup.py install
+ #```
+ #
+ #In my case no container was used; I installed directly on the host machine.
+ #
+ #1. The machine previously had `/usr/local/cuda-10.4`, and compilation failed, because the `nvcc` compiler under `/usr/local/cuda-10.4/bin` does not match the `CUDA Version: 11.7` required by the GPU.
+ #2. I then installed `/usr/local/cuda-11.7` and removed `/usr/local/cuda-10.4`, but installation still failed.
+ #
+ #What finally worked, discovered on a later attempt:
+ #
+ #**A conda virtual environment is required**; installation failed inside a plain python virtualenv.
+ #
+ #**When running `CUDA_VERSION=117 make cuda11x_nomatmul`, make sure the following entries are correct**, namely:
+ #
+ #* `NVCC path`: points to the nvcc compiler inside cuda. (`nvcc` is the compiler shipped with cuda).
+ #
+ #* `CUDA_HOME`: the cuda installation directory, normally set automatically to `/usr/local/cuda`,
+ #
+ #* `CONDA_PREFIX`: the virtual environment created under `conda`.
+ #
+ #* `PATH`: should contain cuda's bin directory.
+ #
+ #* `LD_LIBRARY_PATH`: should contain cuda's lib directory.
+ #
+ #```
+ #(Transformers) [root@nlp bitsandbytes-0.39.1]# CUDA_VERSION=117 make cuda11x_nomatmul
+ #ENVIRONMENT
+ #============================
+ #CUDA_VERSION: 117
+ #============================
+ #NVCC path: /usr/local/cuda/bin/nvcc
+ #GPP path: /usr/bin/g++ VERSION: g++ (GCC) 11.1.0
+ #CUDA_HOME: /usr/local/cuda
+ #CONDA_PREFIX: /usr/local/miniconda3/envs/Transformers
+ #PATH: /usr/local/miniconda3/envs/Transformers/bin:/usr/local/miniconda3/condabin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin:/usr/local/cuda/bin:/root/bin
+ #LD_LIBRARY_PATH: /usr/local/cuda/lib64
+ #============================
+ #```
+ #
+ #In my case, the entry that should have read `LD_LIBRARY_PATH: /usr/local/cuda/lib64` showed up as an empty `LD_LIBRARY_PATH:`.
+ #
+ #Check that `cat ~/.bashrc` contains:
+ #
+ #```text
+ #CUDA_HOME="/usr/local/cuda"
+ #PATH=/usr/local/sbin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin:/usr/local/miniconda3/bin:/usr/local/cuda/bin
+ #LD_LIBRARY_PATH=/usr/local/cuda/lib64
+ #```
+ #
+ #and additionally run
+ #
+ #```text
+ #export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+ #```
+ #
+ #after which the build succeeded.
+ #
+ #**Check whether the installation succeeded**
+ #
+ #After reconnecting the terminal, run the following command before executing `python -m bitsandbytes`.
+ #
+ #```text
+ #export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+ #```
+ #
+ #This step is very strange, because `echo $LD_LIBRARY_PATH` shows `/usr/local/cuda/lib64` in the variable, and `echo $PATH` shows `/usr/local/cuda` in it.
+ #
+ #Yet when running `CUDA_VERSION=117 make cuda11x_nomatmul`, the entry that should read `LD_LIBRARY_PATH: /usr/local/cuda/lib64` shows up as an empty `LD_LIBRARY_PATH:`.
+ #
+ #Simply running the command above once more, and then the command below, makes it succeed.
+ #
+ #After installation, run the following command to check whether it succeeded.
+ #
+ #```
+ #python -m bitsandbytes
+ #```
+ #
+ #If output like the following appears, the installation succeeded.
+ #
+ #```text
+ #...
+ #...
+ #...
+ #Running a quick check that:
+ #    + library is importable
+ #    + CUDA function is callable
+ #
+ #
+ #WARNING: Please be sure to sanitize sensible info from any such env vars!
+ #
+ #SUCCESS!
+ #Installation was successful!
+ #```
+
+
+ # sh install_bitsandbytes.sh --stage 0 --stop_stage 0
+
+
+ verbose=true;
+ stage=0  # start from 0 if you need to start from data preparation
+ stop_stage=5
+
+
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="`eval echo \\$$name`";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+
+       # Set the variable to the right value-- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+
+     *) break;
+   esac
+ done
+
+
+ work_dir="$(pwd)"
+ thirdparty_dir="${work_dir}/thirdparty"
+
+ mkdir -p "${thirdparty_dir}"
+
+
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+   $verbose && echo "stage 0: download bitsandbytes"
+   cd "${thirdparty_dir}" || exit 1;
+
+   wget https://github.com/TimDettmers/bitsandbytes/archive/refs/tags/0.39.1.zip
+   unzip 0.39.1.zip
+   rm -rf 0.39.1.zip
+
+ fi
script/install_conda.sh ADDED
@@ -0,0 +1,58 @@
+ #!/usr/bin/env bash
+
+ # https://www.5axxw.com/questions/simple/umiecs
+
+ # params:
+ system_version="centos";
+
+
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="`eval echo \\$$name`";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+
+       # Set the variable to the right value-- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+
+     *) break;
+   esac
+ done
+
+
+ echo "system_version: ${system_version}";
+
+
+ if [ ${system_version} = "centos" ]; then
+   wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+
+   bash Miniconda3-latest-Linux-x86_64.sh
+
+   /usr/local/miniconda3/bin/conda --version
+
+   cat ~/.bashrc
+   echo "PATH=$PATH:/usr/local/miniconda3/bin" >> /root/.bashrc
+   source ~/.bashrc
+
+   conda --version
+
+ fi
script/install_cuda.sh ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env bash
+
+ # Check the system architecture
+ # >>> uname -a
+ # Linux nlp 3.10.0-1160.66.1.el7.x86_64 #1 SMP Wed May 18 16:02:34 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
+ # >>> uname -m
+ # x86_64
+
+
+ #CUDA installs like ordinary software: multiple versions can coexist.
+
+
+ #Find the matching version at the following URL to get the install command.
+ #https://developer.nvidia.com/cuda-toolkit-archive
+ #
+ #References:
+ #https://www.cnblogs.com/yuezc/p/12937239.html
+ #https://blog.csdn.net/pursuit_zhangyu/article/details/117073126
+ #
+ #[root@nlp dep]# sh cuda_10.2.89_440.33.01_linux.run --override
+ #(After running the command above, follow the prompts; below is the summary printed once installation completes.)
+ #===========
+ #= Summary =
+ #===========
+ #
+ #Driver: Installed
+ #Toolkit: Installed in /usr/local/cuda-10.2/
+ #Samples: Installed in /home/admin/, but missing recommended libraries
+ #
+ #Please make sure that
+ # - PATH includes /usr/local/cuda-10.2/bin
+ # - LD_LIBRARY_PATH includes /usr/local/cuda-10.2/lib64, or, add /usr/local/cuda-10.2/lib64 to /etc/ld.so.conf and run ldconfig as root
+ #
+ #To uninstall the CUDA Toolkit, run cuda-uninstaller in /usr/local/cuda-10.2/bin
+ #To uninstall the NVIDIA Driver, run nvidia-uninstall
+ #
+ #Please see CUDA_Installation_Guide_Linux.pdf in /usr/local/cuda-10.2/doc/pdf for detailed information on setting up CUDA.
+ #Logfile is /var/log/cuda-installer.log
+
+
+ # params:
+ system_version="centos";
+
+
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="`eval echo \\$$name`";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+
+       # Set the variable to the right value-- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+
+     *) break;
+   esac
+ done
+
+
+ echo "system_version: ${system_version}";
+
+
+ if [ ${system_version} = "centos" ]; then
+   #runfile(local)
+   wget https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
+   sudo sh cuda_11.7.0_515.43.04_linux.run --override
+
+   #Select only CUDA Toolkit 11.7 for installation; deselect everything else.
+
+   rm -rf /usr/local/cuda
+   ln -snf /usr/local/cuda-11.7 /usr/local/cuda
+
+   #export CUDA_HOME=/usr/local/cuda
+   #export PATH="${CUDA_HOME}/bin${PATH:+:$PATH}"
+   #export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
+   #export PATH=$PATH:/usr/local/cuda/bin
+   #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
+
+   cat ~/.bashrc
+   echo "PATH=$PATH:/usr/local/cuda/bin" >> /root/.bashrc
+   echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64" >> /root/.bashrc
+   source ~/.bashrc
+
+   #check the cuda version
+   nvcc -V
+
+ fi
script/install_nvidia_driver.sh ADDED
@@ -0,0 +1,184 @@
+ #!/usr/bin/env bash
+ #Installing the GPU driver requires disabling the existing display driver first, rebooting the machine, and then installing.
+ #References:
+ #https://blog.csdn.net/kingschan/article/details/19033595
+ #https://blog.csdn.net/HaixWang/article/details/90408538
+ #
+ #>>> yum install -y pciutils
+ #Check whether the linux machine has a GPU
+ #lspci |grep -i nvidia
+ #
+ #>>> lspci |grep -i nvidia
+ #00:08.0 3D controller: NVIDIA Corporation TU104GL [Tesla T4] (rev a1)
+ #
+ #
+ #NVIDIA driver download
+ #First check on pytorch.org which cuda version should be used and install the matching cuda-toolkit.
+ #Then download and install the nvidia driver matching the gpu model.
+ #
+ ## pytorch version
+ #https://pytorch.org/get-started/locally/
+ #
+ ## CUDA download (apparently not needed here)
+ #https://developer.nvidia.com/cuda-toolkit-archive
+ #
+ ## nvidia driver
+ #https://www.nvidia.cn/Download/index.aspx?lang=cn
+ #http://www.nvidia.com/Download/index.aspx
+ #
+ #Choose from the drop-down lists to identify the appropriate driver for your NVIDIA product.
+ #Product Type:
+ #Data Center / Tesla
+ #Product Series:
+ #T-Series
+ #Product Family:
+ #Tesla T4
+ #Operating System:
+ #Linux 64-bit
+ #CUDA Toolkit:
+ #10.2
+ #Language:
+ #Chinese (Simplified)
+ #
+ #
+ #>>> mkdir -p /data/tianxing
+ #>>> cd /data/tianxing
+ #>>> wget https://cn.download.nvidia.com/tesla/440.118.02/NVIDIA-Linux-x86_64-440.118.02.run
+ #>>> sh NVIDIA-Linux-x86_64-440.118.02.run
+ #
+ ## Error:
+ #ERROR: The Nouveau kernel driver is currently in use by your system. This driver is incompatible with the NVIDIA driver, and must be disabled before proceeding. Please consult the NVIDIA driver README and your
+ #Linux distribution's documentation for details on how to correctly disable the Nouveau kernel driver.
+ #[OK]
+ #
+ #For some distributions, Nouveau can be disabled by adding a file in the modprobe configuration directory. Would you like nvidia-installer to attempt to create this modprobe file for you?
+ #[NO]
+ #
+ #ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download
+ #page at www.nvidia.com.
+ #[OK]
+ #
+ ## Reference:
+ #https://blog.csdn.net/kingschan/article/details/19033595
+ #
+ ## Disable the existing nouveau display driver
+ #>>> echo -e "blacklist nouveau\noptions nouveau modeset=0\n" > /etc/modprobe.d/blacklist-nouveau.conf
+ #>>> sudo dracut --force
+ ## reboot
+ #>>> reboot
+ #
+ #>>> init 3
+ #>>> sh NVIDIA-Linux-x86_64-440.118.02.run
+ #
+ ## Error
+ #ERROR: Unable to find the kernel source tree for the currently running kernel. Please make sure you have installed the kernel source files for your kernel and that they are properly configured; on Red Hat Linux systems, for example, be sure you have the 'kernel-source' or 'kernel-devel' RPM installed. If you know the correct kernel source files are installed, you may specify the kernel source path with the '--kernel-source-path' command line option.
+ #[OK]
+ #ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download
+ #page at www.nvidia.com.
+ #[OK]
+ #
+ ## Reference
+ ## https://blog.csdn.net/HaixWang/article/details/90408538
+ #
+ #>>> uname -r
+ #3.10.0-1160.49.1.el7.x86_64
+ #>>> yum install kernel-devel kernel-headers -y
+ #>>> yum info kernel-devel kernel-headers
+ #>>> yum install -y "kernel-devel-uname-r == $(uname -r)"
+ #>>> yum -y distro-sync
+ #
+ #>>> sh NVIDIA-Linux-x86_64-440.118.02.run
+ #
+ ## Installation succeeded
+ #WARNING: nvidia-installer was forced to guess the X library path '/usr/lib64' and X module path '/usr/lib64/xorg/modules'; these paths were not queryable from the system. If X fails to find the NVIDIA X driver
+ #module, please install the `pkg-config` utility and the X.Org SDK/development package for your distribution and reinstall the driver.
+ #[OK]
+ #Install NVIDIA's 32-bit compatibility libraries?
+ #[YES]
+ #Installation of the kernel module for the NVIDIA Accelerated Graphics Driver for Linux-x86_64 (version 440.118.02) is now complete.
+ #[OK]
+ #
+ #
+ ## Check GPU usage; watch -n 1 -d nvidia-smi refreshes every second.
+ #>>> nvidia-smi
+ #Thu Mar  9 12:00:37 2023
+ #+-----------------------------------------------------------------------------+
+ #| NVIDIA-SMI 440.118.02   Driver Version: 440.118.02   CUDA Version: 10.2     |
+ #|-------------------------------+----------------------+----------------------+
+ #| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+ #| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+ #|===============================+======================+======================|
+ #|   0  Tesla T4            Off  | 00000000:00:08.0 Off |                  Off |
+ #| N/A   54C    P0    22W /  70W |      0MiB / 16127MiB |      0%      Default |
+ #+-------------------------------+----------------------+----------------------+
+ #
+ #+-----------------------------------------------------------------------------+
+ #| Processes:                                                       GPU Memory |
+ #|  GPU       PID   Type   Process name                             Usage      |
+ #|=============================================================================|
+ #|  No running processes found                                                 |
+ #+-----------------------------------------------------------------------------+
+ #
+ #
+
+ # params
+ stage=1
+ nvidia_driver_filename=https://cn.download.nvidia.com/tesla/440.118.02/NVIDIA-Linux-x86_64-440.118.02.run
+
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="`eval echo \\$$name`";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+
+       # Set the variable to the right value-- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+
+     *) break;
+   esac
+ done
+
+ echo "stage: ${stage}";
+
+ yum -y install wget
+ yum -y install sudo
+
+ if [ ${stage} -eq 0 ]; then
+   mkdir -p /data/dep
+   cd /data/dep || echo 1;
+   wget -P /data/dep ${nvidia_driver_filename}
+
+   echo -e "blacklist nouveau\noptions nouveau modeset=0\n" > /etc/modprobe.d/blacklist-nouveau.conf
+   sudo dracut --force
+   # reboot the machine
+   reboot
+ elif [ ${stage} -eq 1 ]; then
+   init 3
+
+   yum install -y kernel-devel kernel-headers
+   yum info kernel-devel kernel-headers
+   yum install -y "kernel-devel-uname-r == $(uname -r)"
+   yum -y distro-sync
+
+   cd /data/dep || echo 1;
+
+   # the installer asks for Enter three times.
+   sh NVIDIA-Linux-x86_64-440.118.02.run
+   nvidia-smi
+ fi
script/install_openssl.sh ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ #!/usr/bin/env bash
+
+ # params:
+ system_version="centos";
+
+
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="$(eval echo \$$name)";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+
+       # Set the variable to the right value -- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+
+     *) break;
+   esac
+ done
+
+ echo "system_version: ${system_version}";
+
+
+ if [ ${system_version} = "centos" ]; then
+   mkdir -p /data/dep
+   cd /data/dep || exit 1;
+
+   if [ ! -e openssl-1.1.1n.tar.gz ]; then
+     wget https://www.openssl.org/source/openssl-1.1.1n.tar.gz --no-check-certificate
+   fi
+
+   if [ ! -d openssl-1.1.1n ]; then
+     tar -zxvf openssl-1.1.1n.tar.gz
+   fi
+
+   # cd into the source tree unconditionally; previously the cd only ran on a fresh
+   # extraction, so a re-run would have configured in the wrong directory.
+   cd /data/dep/openssl-1.1.1n || exit 1;
+
+   ./Configure --prefix=/usr/local/openssl
+
+   make -j && make install
+
+ fi
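+
+ # Usage sketch (the parser above only understands long options, so the single
+ # supported flag is --system_version):
+ # sh ./script/install_openssl.sh --system_version "centos"
+ # The resulting headers and libraries land under /usr/local/openssl (per --prefix).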
script/install_python.sh ADDED
@@ -0,0 +1,126 @@
+ #!/usr/bin/env bash
+
+ # sh ./script/install_python.sh --system_version "centos" --python_version "3.10.11"
+
+ # params:
+ python_version="3.8.10";
+ system_version="centos";
+
+
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="$(eval echo \$$name)";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+
+       # Set the variable to the right value -- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+
+     *) break;
+   esac
+ done
+
+ echo "python_version: ${python_version}";
+ echo "system_version: ${system_version}";
+
+
+ if [ ${system_version} = "centos" ]; then
+   # install the Python build toolchain and headers
+   yum -y groupinstall "Development tools"
+   yum -y install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel
+   yum install libffi-devel -y
+   yum install -y wget
+   yum install -y make
+
+   mkdir -p /data/dep
+   # wget -P /data/dep https://www.python.org/ftp/python/3.10.11/Python-3.10.11.tgz
+   wget -P /data/dep https://www.python.org/ftp/python/${python_version}/Python-${python_version}.tgz
+
+   cd /data/dep || exit 1;
+   tar -zxvf Python-${python_version}.tgz
+   cd /data/dep/Python-${python_version} || exit 1;
+
+   mkdir -p /usr/local/python-${python_version}
+   ./configure --prefix=/usr/local/python-${python_version}
+   make && make install
+
+   # /usr/local/python-3.10.11/bin/python3 -V
+   # /usr/local/python-3.10.11/bin/pip3 -V
+   /usr/local/python-${python_version}/bin/python3 -V
+   /usr/local/python-${python_version}/bin/pip3 -V
+
+   rm -rf /usr/local/bin/python3
+   rm -rf /usr/local/bin/pip3
+   # ln -s /usr/local/python-3.10.11/bin/python3 /usr/local/bin/python3
+   # ln -s /usr/local/python-3.10.11/bin/pip3 /usr/local/bin/pip3
+   ln -s /usr/local/python-${python_version}/bin/python3 /usr/local/bin/python3
+   ln -s /usr/local/python-${python_version}/bin/pip3 /usr/local/bin/pip3
+
+   python3 -V
+   pip3 -V
+
+ elif [ ${system_version} = "ubuntu" ]; then
+   # install the Python build toolchain
+   # https://zhuanlan.zhihu.com/p/506491209
+
+   # refresh the package index
+   sudo apt update
+   # list the currently available upgrades
+   sudo apt list --upgradable
+   # if the previous step listed upgradable packages, apply the upgrades
+   sudo apt -y upgrade
+   # install the GCC compiler
+   sudo apt install -y gcc
+   # verify the installation succeeded
+   gcc -v
+
+   # install build dependencies
+   sudo apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libbz2-dev liblzma-dev sqlite3 libsqlite3-dev tk-dev uuid-dev libgdbm-compat-dev
+
+   mkdir -p /data/dep
+
+   # sudo wget -P /data/dep https://www.python.org/ftp/python/3.6.5/Python-3.6.5.tgz
+   sudo wget -P /data/dep https://www.python.org/ftp/python/${python_version}/Python-${python_version}.tgz
+
+   cd /data/dep || exit 1;
+   # tar -zxvf Python-3.6.5.tgz
+   tar -zxvf Python-${python_version}.tgz
+   # cd /data/dep/Python-3.6.5
+   cd /data/dep/Python-${python_version} || exit 1;
+   # mkdir /usr/local/python-3.6.5
+   mkdir -p /usr/local/python-${python_version}
+
+   # check dependencies and configure the build
+   # sudo ./configure --prefix=/usr/local/python-3.6.5 --enable-optimizations --with-lto --enable-shared
+   sudo ./configure --prefix=/usr/local/python-${python_version} --enable-optimizations --with-lto --enable-shared
+   cpu_count=$(cat /proc/cpuinfo | grep processor | wc -l)
+   # sudo make -j 4
+   sudo make -j "${cpu_count}"
+   # the original stopped after make; without install, the prefix stays empty
+   # and the version checks below would fail
+   sudo make install
+
+   /usr/local/python-${python_version}/bin/python3 -V
+   /usr/local/python-${python_version}/bin/pip3 -V
+
+   rm -rf /usr/local/bin/python3
+   rm -rf /usr/local/bin/pip3
+   ln -s /usr/local/python-${python_version}/bin/python3 /usr/local/bin/python3
+   ln -s /usr/local/python-${python_version}/bin/pip3 /usr/local/bin/pip3
+
+   python3 -V
+   pip3 -V
+ fi
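+
+ # Usage sketch for the Ubuntu branch (mirrors the CentOS example at the top):
+ # sh ./script/install_python.sh --system_version "ubuntu" --python_version "3.10.11"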
toolbox/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/json/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/json/misc.py ADDED
@@ -0,0 +1,63 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ from typing import Callable
+
+
+ def traverse(js, callback: Callable, *args, **kwargs):
+     """Recursively walk a JSON-like object, applying callback to int and str leaves."""
+     if isinstance(js, list):
+         result = list()
+         for item in js:
+             item = traverse(item, callback, *args, **kwargs)
+             result.append(item)
+         return result
+     elif isinstance(js, tuple):
+         result = list()
+         for item in js:
+             item = traverse(item, callback, *args, **kwargs)
+             result.append(item)
+         return tuple(result)
+     elif isinstance(js, dict):
+         result = dict()
+         for k, v in js.items():
+             k = traverse(k, callback, *args, **kwargs)
+             v = traverse(v, callback, *args, **kwargs)
+             result[k] = v
+         return result
+     elif isinstance(js, int):
+         return callback(js, *args, **kwargs)
+     elif isinstance(js, str):
+         return callback(js, *args, **kwargs)
+     else:
+         return js
+
+
+ def demo1():
+     d = {
+         "env": "ppe",
+         "mysql_connect": {
+             "host": "$mysql_connect_host",
+             "port": 3306,
+             "user": "callbot",
+             "password": "NxcloudAI2021!",
+             "database": "callbot_ppe",
+             "charset": "utf8"
+         },
+         "es_connect": {
+             "hosts": ["10.20.251.8"],
+             "http_auth": ["elastic", "ElasticAI2021!"],
+             "port": 9200
+         }
+     }
+
+     def callback(s):
+         # strip the leading '$' from placeholder strings
+         if isinstance(s, str) and s.startswith('$'):
+             return s[1:]
+         return s
+
+     result = traverse(d, callback=callback)
+     print(result)
+     return
+
+
+ if __name__ == '__main__':
+     demo1()
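+
+ # A minimal sketch of threading extra kwargs through traverse (hypothetical values):
+ # >>> traverse({"a": 1, "b": 2}, callback=lambda v, offset=0: v + offset if isinstance(v, int) else v, offset=10)
+ # {'a': 11, 'b': 12}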
toolbox/os/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/os/environment.py ADDED
@@ -0,0 +1,114 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import json
+ import os
+
+ from dotenv import load_dotenv
+ from dotenv.main import DotEnv
+
+ from toolbox.json.misc import traverse
+
+
+ class EnvironmentManager(object):
+     def __init__(self, path, env, override=False):
+         filename = os.path.join(path, '{}.env'.format(env))
+         self.filename = filename
+
+         load_dotenv(
+             dotenv_path=filename,
+             override=override
+         )
+
+         self._environ = dict()
+
+     def open_dotenv(self, filename: str = None):
+         filename = filename or self.filename
+         dotenv = DotEnv(
+             dotenv_path=filename,
+             stream=None,
+             verbose=False,
+             interpolate=False,
+             override=False,
+             encoding="utf-8",
+         )
+         result = dotenv.dict()
+         return result
+
+     def get(self, key, default=None, dtype=str):
+         result = os.environ.get(key)
+         if result is None:
+             result = default
+         else:
+             result = dtype(result)
+         self._environ[key] = result
+         return result
+
+
+ _DEFAULT_DTYPE_MAP = {
+     'int': int,
+     'float': float,
+     'str': str,
+     'json.loads': json.loads
+ }
+
+
+ class JsonConfig(object):
+     """
+     Turn JSON values of the form `$float:threshold` into:
+     look up `threshold` in the environment, then cast it to float.
+     """
+     def __init__(self, dtype_map: dict = None, environment: EnvironmentManager = None):
+         self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
+         self.environment = environment or os.environ
+
+     def sanitize_by_filename(self, filename: str):
+         with open(filename, 'r', encoding='utf-8') as f:
+             js = json.load(f)
+
+         return self.sanitize_by_json(js)
+
+     def sanitize_by_json(self, js):
+         js = traverse(
+             js,
+             callback=self.sanitize,
+             environment=self.environment
+         )
+         return js
+
+     def sanitize(self, string, environment):
+         """Resolve environment-variable references that start with `$`."""
+         if isinstance(string, str) and string.startswith('$'):
+             dtype, key = string[1:].split(':')
+             dtype = self.dtype_map[dtype]
+
+             value = environment.get(key)
+             if value is None:
+                 raise AssertionError('environment variable does not exist. key: {}'.format(key))
+
+             value = dtype(value)
+             result = value
+         else:
+             result = string
+         return result
+
+
+ def demo1():
+     import json
+
+     from project_settings import project_path
+
+     environment = EnvironmentManager(
+         path=os.path.join(project_path, 'server/callbot_server/dotenv'),
+         env='dev',
+     )
+     init_scenes = environment.get(key='init_scenes', dtype=json.loads)
+     print(init_scenes)
+     print(environment._environ)
+     return
+
+
+ if __name__ == '__main__':
+     demo1()
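+
+ # A minimal JsonConfig sketch (hypothetical key and value, using os.environ directly):
+ # >>> os.environ["threshold"] = "0.75"
+ # >>> JsonConfig().sanitize_by_json({"threshold": "$float:threshold"})
+ # {'threshold': 0.75}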
toolbox/os/other.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ import inspect
+
+
+ def pwd():
+     """Return the directory of the file from which this function is called."""
+     frame = inspect.stack()[1]
+     module = inspect.getmodule(frame[0])
+     return os.path.dirname(os.path.abspath(module.__file__))
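+
+ # e.g. a call to pwd() from a (hypothetical) /data/project/app/main.py
+ # returns '/data/project/app'.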
toolbox/transformers/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/transformers/data/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/transformers/data/data_collator.py ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ from typing import Any, Dict, List
+
+ import torch
+
+
+ class SFTDataCollator(object):
+     def __init__(self, tokenizer, max_seq_length):
+         self.tokenizer = tokenizer
+         self.max_seq_length = max_seq_length
+         self.pad_token_id = tokenizer.pad_token_id
+
+     def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
+         # lengths of every sample in the batch
+         lengths = [len(x['input_ids']) for x in batch]
+         # pad to the longest sample in the batch, capped at max_seq_length
+         batch_max_len = min(max(lengths), self.max_seq_length)
+         # batch_max_len = self.max_seq_length
+
+         input_ids_batch, attention_mask_batch, target_mask_batch = [], [], []
+         # truncate and pad
+         for x in batch:
+             input_ids = x["input_ids"]
+             attention_mask = x["attention_mask"]
+             target_mask = x["target_mask"]
+
+             padding_len = batch_max_len - len(input_ids)
+             # padding
+             input_ids = input_ids + [self.pad_token_id] * padding_len
+             attention_mask = attention_mask + [0] * padding_len
+             target_mask = target_mask + [0] * padding_len
+             # truncation
+             input_ids = input_ids[:self.max_seq_length]
+             attention_mask = attention_mask[:self.max_seq_length]
+             target_mask = target_mask[:self.max_seq_length]
+
+             input_ids_batch.append(input_ids)
+             attention_mask_batch.append(attention_mask)
+             target_mask_batch.append(target_mask)
+
+         # convert the lists to tensors to get the final model inputs
+         input_ids_batch = torch.tensor(input_ids_batch, dtype=torch.long)
+         attention_mask_batch = torch.tensor(attention_mask_batch, dtype=torch.long)
+         target_mask_batch = torch.tensor(target_mask_batch, dtype=torch.long)
+         inputs = {
+             "input_ids": input_ids_batch,
+             "attention_mask": attention_mask_batch,
+             "target_mask": target_mask_batch
+         }
+         return inputs
+
+
+ if __name__ == '__main__':
+     pass
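+
+ # A minimal sketch of the pad/truncate behaviour (toy tokenizer, hypothetical ids):
+ # >>> class _Tok: pad_token_id = 0
+ # >>> collator = SFTDataCollator(_Tok(), max_seq_length=4)
+ # >>> batch = [
+ # ...     {"input_ids": [5, 6], "attention_mask": [1, 1], "target_mask": [0, 1]},
+ # ...     {"input_ids": [7, 8, 9], "attention_mask": [1, 1, 1], "target_mask": [0, 1, 1]},
+ # ... ]
+ # >>> collator(batch)["input_ids"].tolist()
+ # [[5, 6, 0], [7, 8, 9]]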
toolbox/transformers/data/dataset/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/transformers/data/dataset/dataset.py ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import json
+ from typing import List
+
+ from torch.utils.data import Dataset
+
+
+ class SFTDataset(Dataset):
+     def __init__(self):
+         self.samples: List[dict] = list()
+
+     def read(self, filename: str):
+         """Load samples from a JSON-lines file, one JSON object per line."""
+         samples = list()
+         with open(filename, "r", encoding="utf-8") as f:
+             for row in f:
+                 row = str(row).strip()
+                 row = json.loads(row)
+                 samples.append(row)
+         self.samples = samples
+         return self
+
+     def __getitem__(self, index):
+         sample = self.samples[index]
+         return sample
+
+     def __len__(self):
+         return len(self.samples)
+
+
+ class ChatGLM2SFTDataset(SFTDataset):
+     def __init__(self, tokenizer, max_seq_length: int):
+         super(ChatGLM2SFTDataset, self).__init__()
+         self.tokenizer = tokenizer
+         self.max_seq_length = max_seq_length
+
+         self.input_format = '[Round {}]\n\n问:{}\n\n答:'
+         self.target_format = "{}"
+
+     def __getitem__(self, index):
+         sample = self.samples[index]
+
+         conversation = sample["conversation"]
+
+         utterances = list()
+         for i, x in enumerate(conversation):
+             human = self.input_format.format(i + 1, x["human"])
+             assistant = self.target_format.format(x["assistant"])
+             utterances += ([human, assistant])
+         utterances_ids = self.tokenizer(utterances, add_special_tokens=False).input_ids
+
+         input_ids = list()
+         target_mask = list()
+         for i, utterances_id in enumerate(utterances_ids):
+             input_ids += utterances_id
+             if i % 2 == 0:
+                 # human turn: excluded from the loss
+                 target_mask += [0] * (len(utterances_id))
+             else:
+                 # assistant turn: included in the loss, plus the trailing eos token
+                 input_ids += [self.tokenizer.eos_token_id]
+                 target_mask += [1] * (len(utterances_id) + 1)
+
+         assert len(input_ids) == len(target_mask)
+
+         input_ids = input_ids[:self.max_seq_length]
+         target_mask = target_mask[:self.max_seq_length]
+         attention_mask = [1] * len(input_ids)
+
+         assert len(input_ids) == len(target_mask) == len(attention_mask)
+
+         inputs = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "target_mask": target_mask
+         }
+         return inputs
+
+
+ if __name__ == '__main__':
+     pass
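+
+ # Layout sketch for a (hypothetical) single-turn sample
+ # {"conversation": [{"human": "...", "assistant": "..."}]}:
+ # the tokenizer sees "[Round 1]\n\n问:...\n\n答:" followed by the assistant text
+ # plus one eos token; target_mask is 0 over the prompt and 1 over the assistant
+ # tokens and the eos, so only the reply is trained on.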
toolbox/transformers/modules/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/transformers/modules/loss.py ADDED
@@ -0,0 +1,51 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import torch
+ import torch.nn as nn
+
+
+ class Loss(object):
+     """
+     Parent class of all loss classes.
+     """
+     def __call__(self, model, inputs, training_args, return_outputs=False):
+         """
+         Compute the loss.
+         Per the Trainer source, return_outputs=False is used during training,
+         and return_outputs=True during eval and predict.
+         :param model: the model
+         :param inputs: the model inputs, a dict
+         :param training_args: the training configuration
+         :param return_outputs: whether to also return the model outputs
+         :return:
+         """
+         raise NotImplementedError
+
+
+ class TargetLMLoss(Loss):
+
+     def __init__(self, ignore_index):
+         super().__init__()
+         self.ignore_index = ignore_index
+         self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)
+
+     def __call__(self, model, inputs, training_args, return_outputs=False):
+         input_ids = inputs["input_ids"]
+         attention_mask = inputs["attention_mask"]
+         target_mask = inputs["target_mask"]
+
+         # forward pass
+         outputs = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+         logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]
+
+         # Set the non-target positions of labels to ignore_index so that
+         # only the target span contributes to the loss.
+         labels = torch.where(target_mask == 1, input_ids, self.ignore_index)
+         shift_logits = logits[..., :-1, :].contiguous()
+         shift_labels = labels[..., 1:].contiguous()
+
+         # Flatten the tokens
+         loss = self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+         return (loss, outputs) if return_outputs else loss
+
+
+ if __name__ == '__main__':
+     pass
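+
+ # Masking sketch (hypothetical ids, ignore_index = -100): for
+ # input_ids = [q1, q2, a1, a2] and target_mask = [0, 0, 1, 1],
+ # labels become [-100, -100, a1, a2]; after the one-position shift the
+ # cross entropy is computed only where the model predicts a1 and a2.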
toolbox/transformers/trainer.py ADDED
@@ -0,0 +1,99 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import os
+ from typing import Callable, Dict, List, Optional, Tuple, Union
+
+ import torch
+ from torch import nn
+ from torch.utils.data import Dataset
+ import transformers
+ from transformers import (
+     PreTrainedModel,
+     TrainingArguments,
+     DataCollator,
+     PreTrainedTokenizerBase,
+     EvalPrediction,
+     TrainerCallback,
+ )
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ # Name of the files used for checkpointing
+ TRAINING_ARGS_NAME = "training_args.bin"
+ TRAINER_STATE_NAME = "trainer_state.json"
+ OPTIMIZER_NAME = "optimizer.pt"
+ SCHEDULER_NAME = "scheduler.pt"
+ SCALER_NAME = "scaler.pt"
+
+
+ class Trainer(transformers.Trainer):
+     """
+     Main change: accept a `compute_loss` callable so the loss computation can be customized.
+     """
+     def __init__(
+         self,
+         model: Union[PreTrainedModel, nn.Module] = None,
+         args: TrainingArguments = None,
+         data_collator: Optional[DataCollator] = None,
+         train_dataset: Optional[Dataset] = None,
+         eval_dataset: Optional[Dataset] = None,
+         tokenizer: Optional[PreTrainedTokenizerBase] = None,
+         model_init: Callable[[], PreTrainedModel] = None,
+         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
+         callbacks: Optional[List[TrainerCallback]] = None,
+         optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+         preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None,
+         compute_loss=None,
+     ):
+         super(Trainer, self).__init__(
+             model=model,
+             args=args,
+             data_collator=data_collator,
+             train_dataset=train_dataset,
+             eval_dataset=eval_dataset,
+             tokenizer=tokenizer,
+             model_init=model_init,
+             compute_metrics=compute_metrics,
+             callbacks=callbacks,
+             optimizers=optimizers,
+             preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+         )
+         self.loss_func = compute_loss
+
+     def compute_loss(self, model, inputs, return_outputs=False):
+         """
+         Override how the loss is computed.
+         How the loss is computed by Trainer. By default, all models return the loss in the first element.
+
+         Subclass and override for custom behavior.
+         """
+         return self.loss_func(model, inputs, self.args, return_outputs)
+
+
+ class LoRATrainer(Trainer):
+     """
+     Change the checkpoint-saving logic so that only the LoRA weights are saved.
+     """
+     def _save(self, output_dir: Optional[str] = None, state_dict=None):
+         # If we are executing this function, we are the process zero, so we don't check for that.
+         output_dir = output_dir if output_dir is not None else self.args.output_dir
+         os.makedirs(output_dir, exist_ok=True)
+         logger.info(f"Saving model checkpoint to {output_dir}")
+         # save the LoRA weights and config
+         self.model.save_pretrained(
+             output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
+         )
+
+         if self.tokenizer is not None:
+             self.tokenizer.save_pretrained(output_dir)
+
+         # Good practice: save your training arguments together with the trained model
+         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+
+
+ if __name__ == '__main__':
+     pass
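+
+ # A minimal wiring sketch (hypothetical objects; TargetLMLoss comes from
+ # toolbox.transformers.modules.loss):
+ # >>> loss_func = TargetLMLoss(ignore_index=tokenizer.pad_token_id)
+ # >>> trainer = LoRATrainer(model=model, args=training_args, data_collator=collator,
+ # ...                       train_dataset=train_dataset, tokenizer=tokenizer,
+ # ...                       compute_loss=loss_func)
+ # >>> trainer.train()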