Update README.md

README.md CHANGED
tags:
- trl
license: apache-2.0
language:
- ja
datasets:
- llm-jp/magpie-sft-v1.0
- Aratako/Magpie-Tanuki-8B-annotated-96k
---

# Uploaded model

This gemma2 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth).

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

## Training data

Instruction fine-tuning was performed on the following datasets:

- https://huggingface.co/datasets/llm-jp/magpie-sft-v1.0 (Apache License 2.0)
- https://huggingface.co/datasets/Aratako/Magpie-Tanuki-8B-annotated-96k (Apache License 2.0)

A sample drawn from these datasets was used for training.
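The sampling script itself is not included in this card; the sketch below shows one minimal way such a subsample could be drawn with the `datasets` library. The split name, subset sizes, and seed are illustrative assumptions rather than the values used for this model.

```python
from datasets import load_dataset

# Both repositories are assumed to expose a "train" split.
magpie_sft = load_dataset("llm-jp/magpie-sft-v1.0", split="train")
magpie_tanuki = load_dataset("Aratako/Magpie-Tanuki-8B-annotated-96k", split="train")

# Draw a random subsample from each dataset (sizes and seed are placeholders).
magpie_sft_sample = magpie_sft.shuffle(seed=42).select(range(10_000))
magpie_tanuki_sample = magpie_tanuki.shuffle(seed=42).select(range(10_000))
```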
## How to run

The code is intended for Google Colab.
Notebook link: under construction.
Note: run the Google Colab notebook at the linked location.

The same code as in the notebook is reproduced below:

```python

!pip install -q transformers==4.46.3 accelerate bitsandbytes
!pip install -q tqdm
!pip install flash-attn --no-build-isolation
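# NOTE: model.load_adapter() further below relies on the peft package; if it is not
# already available in your Colab runtime, install it with: !pip install -q peft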

import os
import torch
import json
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from torch.cuda.amp import autocast
from concurrent.futures import ThreadPoolExecutor
import threading

print("【重要】以下の手順でHugging Faceトークンを設定しておいてください")
print("1. 左メニューの'シークレット'タブを開く")
print("2. '新しいシークレット'をクリック")
print("3. 名前に'HF_TOKEN'を入力")
print("4. 値にHugging Faceトークンを入力して保存")
print("ファイルタブ内にelyza-tasks-100-TV_0.jsonlを配置しておいてください")
print("出力物は、新規に作成されるOutputファイルの中に格納されます")

# Get HF_TOKEN from the Colab secrets
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')
if HF_TOKEN is None:
    raise ValueError("HF_TOKENが設定されていません。上記の手順でトークンを設定してください。")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

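# Note: load_model_and_tokenizer() below builds its own BitsAndBytesConfig, so this
# module-level quantization_config is kept from the notebook but is not used afterwards.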
def load_model_and_tokenizer():
    """
    Download the model and tokenizer in parallel, then load the LoRA checkpoint.
    """
    model_id = "Chrom256/gemma-2-9b-it-lora_20241216_033631"  # path to the fine-tuned adapter
    base_model_id = "google/gemma-2-9b"
    downloaded_components = {"model": None, "tokenizer": None}
    download_lock = threading.Lock()

    def download_base_model():
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            quantization_config=quantization_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="eager",
            low_cpu_mem_usage=True,
            token=HF_TOKEN  # pass the HF token
        )
+
with download_lock:
|
106 |
+
downloaded_components["model"] = model
|
107 |
+
|
108 |
+
def download_tokenizer():
|
109 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
110 |
+
model_id, # 学習済みモデルのトークナイザーを使用
|
111 |
+
trust_remote_code=True,
|
112 |
+
token=HF_TOKEN # トークンを追加
|
113 |
+
)
|
114 |
+
with download_lock:
|
115 |
+
downloaded_components["tokenizer"] = tokenizer
|
116 |
+
|
117 |
+
# GPUキャッシュをクリア
|
118 |
+
torch.cuda.empty_cache()
|
119 |
+
|
120 |
+
# ThreadPoolExecutorを使用して並列ダウンロードを実行
|
121 |
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
122 |
+
model_future = executor.submit(download_base_model)
|
123 |
+
tokenizer_future = executor.submit(download_tokenizer)
|
124 |
+
|
125 |
+
# 両方のダウンロードが完了するまで待機
|
126 |
+
model_future.result()
|
127 |
+
tokenizer_future.result()
|
128 |
+
|
129 |
+
model = downloaded_components["model"]
|
130 |
+
tokenizer = downloaded_components["tokenizer"]
|
131 |
+
|
132 |
+
# GPUキャッシュをクリア(チェックポイントロード前)
|
133 |
+
torch.cuda.empty_cache()
|
134 |
+
|
135 |
+
# チェックポイントのロード
|
136 |
+
try:
|
137 |
+
adapter_path = model_id
|
138 |
+
print(f"Loading adapter from {adapter_path}")
|
139 |
+
model.load_adapter(adapter_path, "default", token=HF_TOKEN) # トークンを追加
|
140 |
+
print("Adapter loaded successfully")
|
141 |
+
except Exception as e:
|
142 |
+
print(f"Error loading adapter: {e}")
|
143 |
+
raise
|
144 |
+
|
145 |
+
# 最終設定
|
146 |
model.config.use_cache = True
|
147 |
model.eval()
|
148 |
+
|
149 |
+
# 最終的なGPUキャッシュのクリア
|
150 |
torch.cuda.empty_cache()
|
151 |
+
|
152 |
return model, tokenizer
|
153 |
|
def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_size=4):
    results = []

    for i in tqdm(range(0, len(tokenized_inputs), batch_size)):
        batch = tokenized_inputs[i:i+batch_size]

        prompts = [
            f"""<start_of_turn>system
簡潔に回答してください。装飾や特殊記号は使用しないでください。
<end_of_turn>
<start_of_turn>user
{item["input"]}
<end_of_turn>
<start_of_turn>model
""" for item in batch
        ]

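        # The prompt uses Gemma-2 style <start_of_turn>/<end_of_turn> markers; the text
        # generated after "<start_of_turn>model" is recovered below by splitting the
        # decoded output on 'model\n'.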
        # Use dynamic padding
        inputs = tokenizer(
            prompts,
            padding=True,  # dynamic padding
            truncation=True,
            return_tensors="pt"
        ).to(model.device)

        with torch.no_grad(), autocast(dtype=torch.bfloat16):
            outputs = model.generate(
                **inputs,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                **generation_config
            )

        for idx, output in enumerate(outputs):
            response = tokenizer.decode(output, skip_special_tokens=True)

            if 'model\n' in response:
                response = response.split('model\n')[-1].strip()
            elif 'model' in response:
                response = response.split('model')[-1].strip()

            # Post-process the response
            response = post_process_output(response)

            results.append({
                "task_id": batch[idx]["task_id"],
                "input": batch[idx]["input"],
                "output": response
            })

        # Free memory after each batch
        del outputs, inputs
        torch.cuda.empty_cache()

    return results

def post_process_output(response):
    response = response.strip()
    symbols_to_replace = ['**', '`', '|', '```', '---', '===']
    for symbol in symbols_to_replace:
        response = response.replace(symbol, ' ')
    return ' '.join(response.split())

GENERATION_CONFIG = {
    "max_new_tokens": 512,
    "use_cache": True,
    "do_sample": False,
    "num_beams": 4,
    "repetition_penalty": 1.2,
    "length_penalty": 1.0,
    "early_stopping": False
}
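# With do_sample=False and num_beams=4 the decoding is deterministic beam search;
# repetition_penalty=1.2 discourages repeated phrases, and each answer is capped at
# 512 new tokens (eos_token_id is passed separately inside run_inference above).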

def load_input_data(file_path):
    tokenized_inputs = []
    with open(file_path, "r") as f:
        for line in f:
            if line.strip():
                dt = json.loads(line)
                tokenized_inputs.append({
                    "task_id": dt["task_id"],
                    "input": dt["input"]
                })
    return tokenized_inputs

def save_results(results, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    jsonl_path = os.path.join(output_dir, "Output.jsonl")

    with open(jsonl_path, 'w', encoding='utf-8') as f:
        for item in results:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

    print(f"Saved results to: {jsonl_path}")

def main():
    model, tokenizer = load_model_and_tokenizer()
    tokenized_inputs = load_input_data("/content/elyza-tasks-100-TV_0.jsonl")
    results = run_inference(model, tokenizer, tokenized_inputs, GENERATION_CONFIG)
    save_results(results, "output")

if __name__ == "__main__":
    main()
```
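As a quick format check (not part of the original notebook), the sketch below writes a tiny input file in the layout expected by `load_input_data()` and reads back the `Output.jsonl` produced by `save_results()`. The sample path and example tasks are made up; the field names `task_id`, `input`, and `output` follow the functions above.

```python
import json

# Hypothetical sample input, mirroring the task_id/input fields read by load_input_data().
sample_tasks = [
    {"task_id": 0, "input": "日本で一番高い山は?"},
    {"task_id": 1, "input": "自己紹介を一文でお願いします。"},
]
with open("/content/sample_tasks.jsonl", "w", encoding="utf-8") as f:
    for task in sample_tasks:
        json.dump(task, f, ensure_ascii=False)
        f.write("\n")
# Pointing load_input_data() at this file gives a quick end-to-end test of run_inference().

# After a full run, each line of output/Output.jsonl holds task_id, input, and output.
with open("output/Output.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        print(record["task_id"], record["output"][:80])
```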