---
datasets:
- kinokokoro/ichikara-instruction-003
language:
- ja
base_model:
- llm-jp/llm-jp-3-13b
---

elyza-tasks-100-TV_0.jsonl の回答モデルの作成のためのコードです。

サンプルコードに対して以下の変更を行いスコア改善を試みました。

- データセットを ichikara-instruction-003 の全てのファイルを利用するよう変更
- 学習率(learning_rate) を 2e-5へ変更
- 累積勾配(gradient_accumulation_steps) を 4 に変更
- LoRAのRANK(LoraConfig r)を 32 に変更

自宅のPC(RTX3090) でコードを実行し、解答を出力しました。

```python
import gc
import json
import os
import re
from datetime import datetime

import bitsandbytes as bnb
import torch
import wandb
from datasets import DatasetDict, concatenate_datasets, load_dataset
from peft import (
    LoraConfig,
    PeftModel,
    get_peft_model,
)
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)
from trl import SFTTrainer

# Placeholders: replace with your own tokens before running.
# Never commit real tokens to a repository.
WANDB_API_KEY = "my-token"
HF_TOKEN = "my-token"

wandb.login(key=WANDB_API_KEY)
wandb.init(project='llm2024-competition')

SEED_VALUE = 42
base_model_id = "llm-jp/llm-jp-3-13b"
new_model_id = "llm-jp-3-13b-finetune"  # Name given to the fine-tuned model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # nf4 is more accurate than plain INT4 and suits the weight
    # distribution of neural networks.
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="cuda:0",  # previously "auto"
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)


def find_all_linear_names(model):
    """Return the leaf names of every 4-bit quantized linear layer in ``model``.

    The result is used as the LoRA ``target_modules`` list. ``lm_head`` is
    removed because it must be excluded for 16-bit computation. The names are
    returned sorted so the list is identical from run to run.
    """
    cls = bnb.nn.Linear4bit  # 4-bit quantized linear layer class
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            # Keep only the last component of a dotted (nested) module name.
            lora_module_names.add(name.rsplit('.', 1)[-1])
    lora_module_names.discard('lm_head')
    return sorted(lora_module_names)


modules = find_all_linear_names(model)

peft_config = LoraConfig(
    r=32,  # previously 16
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

model = get_peft_model(model, peft_config)

# Load every file of the dataset. Each file is listed exactly once: loading
# the same file twice would duplicate its examples in the training data.
# NOTE: an earlier revision of this script loaded 001-1 twice, so results
# produced with that revision saw that file's examples twice.
DATA_DIR = "./Distribution20241221_all"
DATA_FILES = [
    "ichikara-instruction-003-001-1.json",
    "ichikara-instruction-003-001-2.2.json",
    "ichikara-instruction-003-001-5.2.json",
    "ichikara-instruction-003-001-2.1.json",
    "ichikara-instruction-003-001-5.1.json",
    "ichikara-instruction-003-002-1.json",
    "ichikara-instruction-003-003-1.json",
]

train_splits = [
    load_dataset("json", data_files=f"{DATA_DIR}/{file_name}")["train"]
    for file_name in DATA_FILES
]

# Use the concatenated data as a single "train" split.
dataset = DatasetDict({"train": concatenate_datasets(train_splits)})

# Prompt format shared by training and inference. The first slot receives the
# instruction and the second the answer (left empty at inference time).
PROMPT_TEMPLATE = """### 指示
{}
### 回答
{}"""

EOS_TOKEN = tokenizer.eos_token  # End-of-sequence token of the tokenizer


def formatting_prompts_func(examples):
    """Format one record into the training prompt.

    Reads the ``"text"`` (instruction) and ``"output"`` (answer) fields, fills
    them into ``PROMPT_TEMPLATE`` and appends the EOS token.

    Returns:
        A dict with the single new field ``"formatted_text"``.
    """
    instruction = examples["text"]
    answer = examples["output"]
    text = PROMPT_TEMPLATE.format(instruction, answer) + EOS_TOKEN
    return {"formatted_text": text}


# Apply the format to every record.
dataset = dataset.map(
    formatting_prompts_func,
    num_proc=4,  # Number of parallel worker processes
)

# Split into train and test data (test_size is the test ratio).
# Only the "train" split is passed to the trainer below.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=SEED_VALUE)

training_arguments = TrainingArguments(
    output_dir=new_model_id,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # sample code value: 2
    optim="paged_adamw_32bit",
    num_train_epochs=1,  # sample code value: 1
    logging_strategy="steps",
    logging_steps=10,
    warmup_steps=10,
    save_steps=100,
    save_total_limit=2,
    max_steps=-1,  # sample code value: -1
    learning_rate=2e-5,  # sample code value: 5e-5
    fp16=False,
    bf16=False,
    seed=SEED_VALUE,
    group_by_length=True,
    report_to="wandb",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="formatted_text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

model.config.use_cache = False  # Disable the KV cache during training
trainer.train()  # Run training

# Timestamp used in the output file name, e.g. "20241214_153045".
now = datetime.now()
formatted_date = now.strftime("%Y%m%d_%H%M%S")
print(formatted_date)

# Load the task data.
# In the omnicampus environment, drag and drop the task jsonl into the left
# pane before running.
tasks = []
# Explicit UTF-8: the file contains Japanese text and the platform default
# encoding is not UTF-8 everywhere.
with open("./elyza-tasks-100-TV_0.jsonl", "r", encoding="utf-8") as f:
    item = ""
    for line in f:
        line = line.strip()
        item += line
        # A record may span several lines; parse once the object is closed.
        if item.endswith("}"):
            tasks.append(json.loads(item))
            item = ""

# Run the tasks through the model.
# Switch to eval mode so dropout (lora_dropout=0.05) is disabled during
# generation.
model.eval()

results = []
for data in tqdm(tasks):
    task_input = data["input"]

    # Same template as training, with the answer slot left empty.
    prompt = PROMPT_TEMPLATE.format(task_input, "")

    tokenized_input = tokenizer.encode(
        prompt, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)
    attention_mask = torch.ones_like(tokenized_input)

    with torch.no_grad():
        outputs = model.generate(
            tokenized_input,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
        )[0]
    # Decode only the newly generated tokens, not the prompt.
    output = tokenizer.decode(
        outputs[tokenized_input.size(1):], skip_special_tokens=True
    )

    results.append({"task_id": data["task_id"], "input": task_input, "output": output})

# Submit the jsonl generated here.
# The file also contains "input", which is optional;
# only task_id and output are required.
jsonl_id = re.sub(r".*/", "", new_model_id)
with open(f"./{jsonl_id}-outputs-{formatted_date}.jsonl", 'w', encoding='utf-8') as f:
    for result in results:
        json.dump(result, f, ensure_ascii=False)  # keep non-ASCII characters readable
        f.write('\n')

# Upload the model and the tokenizer to the Hugging Face Hub.
model.push_to_hub(new_model_id, token=HF_TOKEN, private=True)  # Online saving
tokenizer.push_to_hub(new_model_id, token=HF_TOKEN, private=True)  # Online saving
```

---
library_name: transformers
---

# Model Card for Model ID

## Model Details

### Model Description

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

### Direct Use

[More Information Needed]

### Downstream Use [optional]

[More Information Needed]

### Out-of-Scope Use

[More Information Needed]

## Bias, Risks, and Limitations

[More Information Needed]

### Recommendations

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.
[More Information Needed]

## Training Details

### Training Data

[More Information Needed]

### Training Procedure

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed]

#### Speeds, Sizes, Times [optional]

[More Information Needed]

## Evaluation

### Testing Data, Factors & Metrics

#### Testing Data

[More Information Needed]

#### Factors

[More Information Needed]

#### Metrics

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

[More Information Needed]

## Environmental Impact

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]