Chrom256 committed
Commit b81b094
1 Parent(s): 1f2c4d1

Update README.md

Files changed (1):
  1. README.md +182 -0
README.md CHANGED
@@ -20,3 +20,185 @@ language:
 This gemma2 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.

 [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

Training data
- https://huggingface.co/datasets/llm-jp/magpie-sft-v1.0
- https://huggingface.co/datasets/Aratako/Magpie-Tanuki-8B-annotated-96k

Both are licensed under Apache-2.0.
(This section is still being edited; a minimal loading sketch follows below.)
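The datasets above are only linked, not loaded anywhere in this README. For reference, here is a minimal sketch of pulling both from the Hub with the `datasets` library; the `split="train"` argument and the printed column names are assumptions, since the actual training script is not included.

```python
# Minimal sketch (not the author's training code): fetch the two SFT datasets.
# Assumption: both repos are public and expose a "train" split.
from datasets import load_dataset

magpie_sft = load_dataset("llm-jp/magpie-sft-v1.0", split="train")
magpie_tanuki = load_dataset("Aratako/Magpie-Tanuki-8B-annotated-96k", split="train")

print(len(magpie_sft), magpie_sft.column_names)
print(len(magpie_tanuki), magpie_tanuki.column_names)
```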

Inference code
For Google Colab
Link:
*Please run the Google Colab notebook at the link.

The same code as in that notebook is included below.
```python
!pip install -q transformers==4.46.3 accelerate bitsandbytes
!pip install -q tqdm
!pip install flash-attn --no-build-isolation

import os
import torch
import json
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

print("[Important] Set your Hugging Face token as follows:")
print("1. Open the 'Secrets' tab in the left-hand menu")
print("2. Click 'Add new secret'")
print("3. Enter 'HF_TOKEN' as the name")
print("4. Paste your Hugging Face token as the value and save it")

# Read HF_TOKEN from the Colab secrets
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

if HF_TOKEN is None:
    raise ValueError("HF_TOKEN is not set. Please configure the token using the steps above.")
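# Editorial note (assumption, not in the original): HF_TOKEN is read here but never passed
# to from_pretrained(). That is fine for public repos; for a gated or private repo you would
# likely need token=HF_TOKEN in the from_pretrained() calls, or huggingface_hub.login(HF_TOKEN).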

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
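# Editorial note: this configuration loads the weights as 4-bit NF4 with double quantization
# and runs compute in bfloat16, which is what lets the 9B model fit in Colab GPU memory.
# bfloat16 compute is only native on Ampere-or-newer GPUs; on older ones (e.g. T4) switching
# bnb_4bit_compute_dtype to torch.float16 may be necessary (editorial assumption).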

def load_model_and_tokenizer():
    model = AutoModelForCausalLM.from_pretrained(
        "Chrom256/gemma-2-9b-it-lora_20241216_033631",
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="eager",
        low_cpu_mem_usage=True
    )

    tokenizer = AutoTokenizer.from_pretrained(
        "Chrom256/gemma-2-9b-it-lora_20241216_033631",
        trust_remote_code=True
    )

    model.config.use_cache = True
    model.eval()
    torch.cuda.empty_cache()

    return model, tokenizer

GENERATION_CONFIG = {
    # Marked "under editing" in the original README; no values were provided.
    # An empty dict is valid here: model.generate() simply falls back to its defaults.
    # Illustrative assumption only, e.g.: "max_new_tokens": 512, "do_sample": False
}

def load_input_data(file_path):
    tokenized_inputs = []
    with open(file_path, "r") as f:
        for line in f:
            if line.strip():
                dt = json.loads(line)
                tokenized_inputs.append({
                    "task_id": dt["task_id"],
                    "input": dt["input"]
                })
    return tokenized_inputs

def prepare_inputs(batch_data, tokenizer, max_length=1024):
    batch_inputs = []
    for data in batch_data:
        # System prompt (Japanese): "Answer concisely. Do not use decorations or special symbols."
        prompt = f"""<start_of_turn>system
簡潔に回答してください。装飾や特殊記号は使用しないでください。
<end_of_turn>
<start_of_turn>user
{data["input"]}
<end_of_turn>
<start_of_turn>model
"""
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length
        )
        batch_inputs.append(inputs)

    return {
        "input_ids": torch.cat([inp["input_ids"] for inp in batch_inputs], dim=0),
        "attention_mask": torch.cat([inp["attention_mask"] for inp in batch_inputs], dim=0)
    }
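# Editorial notes (assumptions, not from the original):
# - padding="max_length" pads every prompt to 1024 tokens and relies on the tokenizer's
#   configured padding side; for decoder-only generation, left padding is generally the safe
#   choice, so check tokenizer.padding_side if responses look empty or truncated.
# - Gemma 2's chat template officially defines only user/model turns; the <start_of_turn>system
#   block above is the author's own prompt formatting.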

def post_process_output(response):
    response = response.strip()
    symbols_to_replace = ['**', '`', '|', '```', '---', '===']
    for symbol in symbols_to_replace:
        response = response.replace(symbol, ' ')
    return ' '.join(response.split())

# If you run out of memory, lower batch_size (e.g. to 2)
def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_size=4):
    results = []

    for i in tqdm(range(0, len(tokenized_inputs), batch_size)):
        batch = tokenized_inputs[i:i+batch_size]

        with torch.no_grad():
            inputs = prepare_inputs(batch, tokenizer)
            inputs = {k: v.to(model.device, non_blocking=True) for k, v in inputs.items()}

            outputs = model.generate(
                **inputs,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                **generation_config
            )

        for idx, b in enumerate(batch):
            full_output = tokenizer.decode(outputs[idx], skip_special_tokens=True)

            # Keep only the text after the final "model" turn marker
            if 'model\n' in full_output:
                response = full_output.split('model\n')[-1].strip()
            elif 'model' in full_output:
                response = full_output.split('model')[-1].strip()
            else:
                response = full_output.strip()

            processed_response = post_process_output(response)

            results.append({
                "task_id": b["task_id"],
                "input": b["input"],
                "output": processed_response,
            })

        del outputs
        torch.cuda.empty_cache()

    return results

def save_results(results, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    jsonl_path = os.path.join(output_dir, "Output.jsonl")

    with open(jsonl_path, 'w', encoding='utf-8') as f:
        for item in results:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

    print(f"Saved results to: {jsonl_path}")

def main():
    model, tokenizer = load_model_and_tokenizer()

    # Load the input data.
    # The path assumes the file was uploaded via Colab's file pane;
    # adjust it as needed.
    tokenized_inputs = load_input_data("/content/elyza-tasks-100-TV_0.jsonl")

    results = run_inference(model, tokenizer, tokenized_inputs, GENERATION_CONFIG)
    save_results(results, "output")

if __name__ == "__main__":
    main()
```
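
For reference, the script expects its input JSONL to contain one object per line with `task_id` and `input` fields, and it writes `output/Output.jsonl` with an added `output` field. Below is a minimal, hypothetical example of creating such an input file; the task texts and the `sample_input.jsonl` filename are invented for illustration, so the path in `main()` would need to point at it.

```python
# Minimal sketch: build a tiny input file in the format load_input_data() expects.
# The tasks below are invented examples, not taken from elyza-tasks-100-TV.
import json

sample_tasks = [
    {"task_id": 0, "input": "日本で一番高い山は何ですか?"},
    {"task_id": 1, "input": "富士山の標高を教えてください。"},
]

with open("/content/sample_input.jsonl", "w", encoding="utf-8") as f:
    for task in sample_tasks:
        f.write(json.dumps(task, ensure_ascii=False) + "\n")

# After main() runs, each line of output/Output.jsonl holds
# {"task_id": ..., "input": ..., "output": ...}.
```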