Chrom256 committed
Commit: ded24cb
Parent: 1aa49eb

Update README.md

Files changed (1): README.md +7 -19
README.md CHANGED
@@ -76,10 +76,7 @@ quantization_config = BitsAndBytesConfig(
 )
 
 def load_model_and_tokenizer():
-    """
-    Download the model and tokenizer in parallel, then load the checkpoint.
-    """
-    model_id = "Chrom256/gemma-2-9b-it-lora_20241216_033631"  # path to your model
+    model_id = "Chrom256/gemma-2-9b-it-lora_20241216_033631"
     base_model_id = "google/gemma-2-9b"
     downloaded_components = {"model": None, "tokenizer": None}
     download_lock = threading.Lock()
@@ -100,53 +97,47 @@ def load_model_and_tokenizer():
             torch_dtype=torch.bfloat16,
             attn_implementation="eager",
             low_cpu_mem_usage=True,
-            token=HF_TOKEN  # add the token
+            token=HF_TOKEN
         )
         with download_lock:
             downloaded_components["model"] = model
 
     def download_tokenizer():
         tokenizer = AutoTokenizer.from_pretrained(
-            model_id,  # use the trained model's tokenizer
+            model_id,
             trust_remote_code=True,
-            token=HF_TOKEN  # add the token
+            token=HF_TOKEN
         )
         with download_lock:
             downloaded_components["tokenizer"] = tokenizer
 
-    # Clear the GPU cache
     torch.cuda.empty_cache()
 
-    # Run both downloads in parallel with ThreadPoolExecutor
+    # Parallel download with ThreadPoolExecutor
     with ThreadPoolExecutor(max_workers=2) as executor:
         model_future = executor.submit(download_base_model)
         tokenizer_future = executor.submit(download_tokenizer)
 
-        # Wait until both downloads have finished
         model_future.result()
         tokenizer_future.result()
 
     model = downloaded_components["model"]
     tokenizer = downloaded_components["tokenizer"]
 
-    # Clear the GPU cache (before loading the checkpoint)
    torch.cuda.empty_cache()
 
-    # Load the checkpoint
    try:
        adapter_path = model_id
        print(f"Loading adapter from {adapter_path}")
-        model.load_adapter(adapter_path, "default", token=HF_TOKEN)  # add the token
+        model.load_adapter(adapter_path, "default", token=HF_TOKEN)
        print("Adapter loaded successfully")
    except Exception as e:
        print(f"Error loading adapter: {e}")
        raise
 
-    # Final settings
    model.config.use_cache = True
    model.eval()
 
-    # Final GPU cache clear
    torch.cuda.empty_cache()
 
    return model, tokenizer
@@ -168,10 +159,9 @@ def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_s
            """ for item in batch
        ]
 
-        # Use dynamic padding
        inputs = tokenizer(
            prompts,
-            padding=True,  # dynamic padding
+            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(model.device)
@@ -192,7 +182,6 @@ def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_s
            elif 'model' in response:
                response = response.split('model')[-1].strip()
 
-            # Add post-processing
            response = post_process_output(response)
 
            results.append({
@@ -201,7 +190,6 @@ def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_s
                "output": response
            })
 
-        # Free memory after batch processing
        del outputs, inputs
        torch.cuda.empty_cache()
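For orientation, here is a minimal, hypothetical usage sketch of the function touched by this commit. It assumes HF_TOKEN is read from an environment variable and that `load_model_and_tokenizer()` from the README above is already defined in the same script; none of this is part of the commit itself.

```python
import os
import torch

# Hypothetical setup (not in the diff): the updated code passes token=HF_TOKEN to the
# download calls, so HF_TOKEN must exist before load_model_and_tokenizer() is called.
HF_TOKEN = os.environ.get("HF_TOKEN")

# load_model_and_tokenizer() is the function shown in the diff above.
model, tokenizer = load_model_and_tokenizer()

# Illustrative smoke test of the base model + LoRA adapter; the README's
# run_inference() handles batching, dynamic padding, and post-processing.
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```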
 
 