Chrom256
/

gemma-2-9b-it-lora_20241216_033631

@@ -8,7 +8,10 @@ tags:
 - trl
 license: apache-2.0
 language:
-- en
 ---
 # Uploaded  model
@@ -22,18 +25,21 @@ This gemma2 model was trained 2x faster with [Unsloth](https://github.com/unslot
 [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
 訓練用データ
 - https://huggingface.co/datasets/llm-jp/magpie-sft-v1.0
 - https://huggingface.co/datasets/Aratako/Magpie-Tanuki-8B-annotated-96k
-共にapache-2.0
-編集中
 実行コード
 Google Colab用
-リンク先：
 ＊リンク先のGoogle Colabノートを実行してください
 以下にノートと同じコードを掲載します
 ```python
 !pip install -q transformers==4.46.3 accelerate bitsandbytes
 !pip install -q tqdm
 !pip install flash-attn --no-build-isolation
@@ -43,12 +49,17 @@ import torch
 import json
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-print("【重要】以下の手順でHugging Faceトークンを設定してください：")
 print("1. 左メニューの'シークレット'タブを開く")
 print("2. '新しいシークレット'をクリック")
 print("3. 名前に'HF_TOKEN'を入力")
 print("4. 値にHugging Faceトークンを入力して保存")
 # シークレットからHF_TOKENを取得
 from google.colab import userdata
@@ -57,8 +68,6 @@ HF_TOKEN = userdata.get('HF_TOKEN')
 if HF_TOKEN is None:
     raise ValueError("HF_TOKENが設定されていません。上記の手順でトークンを設定してください。")
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -67,138 +76,183 @@ quantization_config = BitsAndBytesConfig(
 )
 def load_model_and_tokenizer():
-    model = AutoModelForCausalLM.from_pretrained(
-        "Chrom256/gemma-2-9b-it-lora_20241216_033631",
-        quantization_config=quantization_config,
-        device_map="auto",
-        trust_remote_code=True,
-        torch_dtype=torch.bfloat16,
-        attn_implementation="eager",
-        low_cpu_mem_usage=True
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        "Chrom256/gemma-2-9b-it-lora_20241216_033631",
-        trust_remote_code=True
-    )
     model.config.use_cache = True
     model.eval()
     torch.cuda.empty_cache()
     return model, tokenizer
-GENERATION_CONFIG = {
-！編集中！
-}
-def load_input_data(file_path):
-    tokenized_inputs = []
-    with open(file_path, "r") as f:
-        for line in f:
-            if line.strip():
-                dt = json.loads(line)
-                tokenized_inputs.append({
-                    "task_id": dt["task_id"],
-                    "input": dt["input"]
-                })
-    return tokenized_inputs
-def prepare_inputs(batch_data, tokenizer, max_length=1024):
-    batch_inputs = []
-    for data in batch_data:
-        prompt = f"""<start_of_turn>system
 簡潔に回答してください。装飾や特殊記号は使用しないでください。
 <end_of_turn>
 <start_of_turn>user
-{data["input"]}
 <end_of_turn>
 <start_of_turn>model
-"""
         inputs = tokenizer(
-            prompt,
-            return_tensors="pt",
-            padding="max_length",
             truncation=True,
-            max_length=max_length
-        )
-        batch_inputs.append(inputs)
-    return {
-        "input_ids": torch.cat([inp["input_ids"] for inp in batch_inputs], dim=0),
-        "attention_mask": torch.cat([inp["attention_mask"] for inp in batch_inputs], dim=0)
-    }
-def post_process_output(response):
-    response = response.strip()
-    symbols_to_replace = ['**', '`', '|', '```', '---', '===']
-    for symbol in symbols_to_replace:
-        response = response.replace(symbol, ' ')
-    return ' '.join(response.split())
-#メモリが足りない場合は、batch_sizeを下げてください(2など)
-def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_size=4):
-    results = []
-    for i in tqdm(range(0, len(tokenized_inputs), batch_size)):
-        batch = tokenized_inputs[i:i+batch_size]
-        with torch.no_grad():
-            inputs = prepare_inputs(batch, tokenizer)
-            inputs = {k: v.to(model.device, non_blocking=True) for k, v in inputs.items()}
             outputs = model.generate(
                 **inputs,
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
                 **generation_config
             )
-            for idx, b in enumerate(batch):
-                full_output = tokenizer.decode(outputs[idx], skip_special_tokens=True)
-                if 'model\n' in full_output:
-                    response = full_output.split('model\n')[-1].strip()
-                elif 'model' in full_output:
-                    response = full_output.split('model')[-1].strip()
-                else:
-                    response = full_output.strip()
-                processed_response = post_process_output(response)
                 results.append({
-                    "task_id": b["task_id"],
-                    "input": b["input"],
-                    "output": processed_response,
                 })
-            del outputs
-            torch.cuda.empty_cache()
     return results
 def save_results(results, output_dir):
     os.makedirs(output_dir, exist_ok=True)
     jsonl_path = os.path.join(output_dir, "Output.jsonl")
     with open(jsonl_path, 'w', encoding='utf-8') as f:
         for item in results:
             json.dump(item, f, ensure_ascii=False)
             f.write('\n')
     print(f"Saved results to: {jsonl_path}")
 def main():
     model, tokenizer = load_model_and_tokenizer()
-    # 入力データの読み込み
-    #Google colabのファイルにアップロードした際のpathにしてあります
-    #必要に応じてpathの修正をお願いします
     tokenized_inputs = load_input_data("/content/elyza-tasks-100-TV_0.jsonl")
     results = run_inference(model, tokenizer, tokenized_inputs, GENERATION_CONFIG)
     save_results(results, "output")
 if __name__ == "__main__":
     main()
-```

 - trl
 license: apache-2.0
 language:
+- ja
+datasets:
+- llm-jp/magpie-sft-v1.0
+- Aratako/Magpie-Tanuki-8B-annotated-96k
 ---
 # Uploaded  model
 [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
 訓練用データ
+以下のデータでInstruction finetuningを実施した
 - https://huggingface.co/datasets/llm-jp/magpie-sft-v1.0
+(Apache license 2.0)
 - https://huggingface.co/datasets/Aratako/Magpie-Tanuki-8B-annotated-96k
+(Apache license 2.0)
+データをサンプリングして活用
 実行コード
 Google Colab用
+リンク先：編集中
 ＊リンク先のGoogle Colabノートを実行してください
 以下にノートと同じコードを掲載します
 ```python
 !pip install -q transformers==4.46.3 accelerate bitsandbytes
 !pip install -q tqdm
 !pip install flash-attn --no-build-isolation
 import json
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from torch.cuda.amp import autocast
+from concurrent.futures import ThreadPoolExecutor
+import threading
+print("【重要】以下の手順でHugging Faceトークンを設定しておいてください")
 print("1. 左メニューの'シークレット'タブを開く")
 print("2. '新しいシークレット'をクリック")
 print("3. 名前に'HF_TOKEN'を入力")
 print("4. 値にHugging Faceトークンを入力して保存")
+print("ファイルタブ内にelyza-tasks-100-TV_0.jsonlを配置しておいてください")
+print("出力物は、新規に作成されるOutputファイルの中に格納されます")
 # シークレットからHF_TOKENを取得
 from google.colab import userdata
 if HF_TOKEN is None:
     raise ValueError("HF_TOKENが設定されていません。上記の手順でトークンを設定してください。")
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
 )
 def load_model_and_tokenizer():
+    """
+    Load the 4-bit quantized base model and the tokenizer in two worker
+    threads, then attach the adapter stored under ``model_id``.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple; the model has ``use_cache`` enabled
+        and is switched to eval mode.
+
+    Raises:
+        Exception: any error raised while loading the adapter is printed and
+            re-raised; errors from the worker threads surface via ``.result()``.
+    """
+    model_id = "Chrom256/gemma-2-9b-it-lora_20241216_033631"  # repo that provides the adapter and the tokenizer
+    base_model_id = "google/gemma-2-9b"
+    # Shared slots that the two worker functions fill in; guarded by the lock.
+    downloaded_components = {"model": None, "tokenizer": None}
+    download_lock = threading.Lock()
+    def download_base_model():
+        # Local config for this load; it is the one passed to from_pretrained
+        # below (it shadows any module-level quantization_config).
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.bfloat16
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            base_model_id,
+            quantization_config=quantization_config,
+            device_map="auto",
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+            attn_implementation="eager",
+            low_cpu_mem_usage=True,
+            token=HF_TOKEN  # authenticate with the Hugging Face token
+        )
+        with download_lock:
+            downloaded_components["model"] = model
+    def download_tokenizer():
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,  # use the tokenizer shipped with the fine-tuned model repo
+            trust_remote_code=True,
+            token=HF_TOKEN  # authenticate with the Hugging Face token
+        )
+        with download_lock:
+            downloaded_components["tokenizer"] = tokenizer
+    # Clear the GPU cache before loading anything.
+    torch.cuda.empty_cache()
+    # Run both loads concurrently on a two-worker thread pool.
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        model_future = executor.submit(download_base_model)
+        tokenizer_future = executor.submit(download_tokenizer)
+        # Wait for both loads to finish; result() re-raises worker exceptions.
+        model_future.result()
+        tokenizer_future.result()
+    model = downloaded_components["model"]
+    tokenizer = downloaded_components["tokenizer"]
+    # Clear the GPU cache again before the adapter is loaded.
+    torch.cuda.empty_cache()
+    # Attach the adapter weights to the base model.
+    try:
+        adapter_path = model_id
+        print(f"Loading adapter from {adapter_path}")
+        model.load_adapter(adapter_path, "default", token=HF_TOKEN)  # registered under the adapter name "default"
+        print("Adapter loaded successfully")
+    except Exception as e:
+        print(f"Error loading adapter: {e}")
+        raise
+    # Final inference-time settings.
     model.config.use_cache = True
     model.eval()
+    # Final GPU cache clear before handing the model back.
     torch.cuda.empty_cache()
     return model, tokenizer
+def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_size=4):
+    results = []
+    for i in tqdm(range(0, len(tokenized_inputs), batch_size)):
+        batch = tokenized_inputs[i:i+batch_size]
+        prompts = [
+            f"""<start_of_turn>system
 簡潔に回答してください。装飾や特殊記号は使用しないでください。
 <end_of_turn>
 <start_of_turn>user
+{item["input"]}
 <end_of_turn>
 <start_of_turn>model
+""" for item in batch
+        ]
+        # 動的パディングを使用
         inputs = tokenizer(
+            prompts,
+            padding=True,  # 動的パディング
             truncation=True,
+            return_tensors="pt"
+        ).to(model.device)
+        with torch.no_grad(), autocast(dtype=torch.bfloat16):
             outputs = model.generate(
                 **inputs,
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
                 **generation_config
             )
+            for idx, output in enumerate(outputs):
+                response = tokenizer.decode(output, skip_special_tokens=True)
+                if 'model\n' in response:
+                    response = response.split('model\n')[-1].strip()
+                elif 'model' in response:
+                    response = response.split('model')[-1].strip()
+                # 後処理を追加
+                response = post_process_output(response)
                 results.append({
+                    "task_id": batch[idx]["task_id"],
+                    "input": batch[idx]["input"],
+                    "output": response
                 })
+        # バッチ処理後のメモリ解放
+        del outputs, inputs
+        torch.cuda.empty_cache()
     return results
+def post_process_output(response):
+    response = response.strip()
+    symbols_to_replace = ['**', '`', '|', '```', '---', '===']
+    for symbol in symbols_to_replace:
+        response = response.replace(symbol, ' ')
+    return ' '.join(response.split())
+# Keyword arguments that main() hands to run_inference(), which forwards them
+# to model.generate(): beam search with 4 beams and no sampling, at most 512
+# new tokens per prompt, and a repetition penalty of 1.2.
+GENERATION_CONFIG = {
+    "max_new_tokens": 512,
+    "use_cache": True,
+    "do_sample": False,
+    "num_beams": 4,
+    "repetition_penalty": 1.2,
+    "length_penalty": 1.0,
+    "early_stopping": False
+}
+def load_input_data(file_path):
+    tokenized_inputs = []
+    with open(file_path, "r") as f:
+        for line in f:
+            if line.strip():
+                dt = json.loads(line)
+                tokenized_inputs.append({
+                    "task_id": dt["task_id"],
+                    "input": dt["input"]
+                })
+    return tokenized_inputs
 def save_results(results, output_dir):
     """Write the results as JSON Lines to ``Output.jsonl`` inside ``output_dir``.

     The directory is created when missing. Each item becomes one line of
     UTF-8 JSON with non-ASCII characters left unescaped. The destination
     path is printed once writing is done.

     Args:
         results: Iterable of JSON-serializable items.
         output_dir: Directory that receives ``Output.jsonl``.
     """
     os.makedirs(output_dir, exist_ok=True)
     destination = os.path.join(output_dir, "Output.jsonl")
     with open(destination, 'w', encoding='utf-8') as sink:
         for record in results:
             json.dump(record, sink, ensure_ascii=False)
             sink.write('\n')
     print(f"Saved results to: {destination}")
 def main():
     """Load the model, read the task file, run generation and save the answers."""
     llm, tok = load_model_and_tokenizer()
     tasks = load_input_data("/content/elyza-tasks-100-TV_0.jsonl")
     answers = run_inference(llm, tok, tasks, GENERATION_CONFIG)
     save_results(answers, "output")
 if __name__ == "__main__":
     main()
+```