Chrom256
/

gemma-2-9b-it-lora_20241216_033631

@@ -54,7 +54,6 @@ L4でのコード実行時間は全体で約45分でした。
 以下にGoogle Colabノートブックと同じコードを掲載します。
 ```python
 !pip install -q transformers==4.46.3 accelerate bitsandbytes
 !pip install -q tqdm
 !pip install flash-attn --no-build-isolation
@@ -74,9 +73,8 @@ print("2. '新しいシークレット'をクリック")
 print("3. 名前に'HF_TOKEN'を入力")
 print("4. 値にHugging Faceトークンを入力して保存")
 print("ファイルタブ内にelyza-tasks-100-TV_0.jsonlを配置しておいてください")
-print("出力物は、新規に作成されるOutputファイルの中に格納されます")
-# シークレットからHF_TOKENを取得
 from google.colab import userdata
 HF_TOKEN = userdata.get('HF_TOKEN')
@@ -91,6 +89,9 @@ quantization_config = BitsAndBytesConfig(
 )
 def load_model_and_tokenizer():
     model_id = "Chrom256/gemma-2-9b-it-lora_20241216_033631"
     base_model_id = "google/gemma-2-9b"
     downloaded_components = {"model": None, "tokenizer": None}
@@ -126,21 +127,25 @@ def load_model_and_tokenizer():
         with download_lock:
             downloaded_components["tokenizer"] = tokenizer
     torch.cuda.empty_cache()
-    # ThreadPoolExecutorを使用して並列ダウンロード
     with ThreadPoolExecutor(max_workers=2) as executor:
         model_future = executor.submit(download_base_model)
         tokenizer_future = executor.submit(download_tokenizer)
         model_future.result()
         tokenizer_future.result()
     model = downloaded_components["model"]
     tokenizer = downloaded_components["tokenizer"]
     torch.cuda.empty_cache()
     try:
         adapter_path = model_id
         print(f"Loading adapter from {adapter_path}")
@@ -150,9 +155,11 @@ def load_model_and_tokenizer():
         print(f"Error loading adapter: {e}")
         raise
     model.config.use_cache = True
     model.eval()
     torch.cuda.empty_cache()
     return model, tokenizer
@@ -174,6 +181,7 @@ def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_s
 """ for item in batch
         ]
         inputs = tokenizer(
             prompts,
             padding=True,
@@ -197,6 +205,7 @@ def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_s
                 elif 'model' in response:
                     response = response.split('model')[-1].strip()
                 response = post_process_output(response)
                 results.append({
@@ -205,6 +214,7 @@ def run_inference(model, tokenizer, tokenized_inputs, generation_config, batch_s
                     "output": response
                 })
         del outputs, inputs
         torch.cuda.empty_cache()

 以下にGoogle Colabノートブックと同じコードを掲載します。
 ```python
 !pip install -q transformers==4.46.3 accelerate bitsandbytes
 !pip install -q tqdm
 !pip install flash-attn --no-build-isolation
 print("3. 名前に'HF_TOKEN'を入力")
 print("4. 値にHugging Faceトークンを入力して保存")
 print("ファイルタブ内にelyza-tasks-100-TV_0.jsonlを配置しておいてください")
+print("出力物は、新規に作成されるoutputファイルの中に格納されます")
 from google.colab import userdata
 HF_TOKEN = userdata.get('HF_TOKEN')
 )
 def load_model_and_tokenizer():
+    """
+    モデルとトークナイザーを並列でダウンロードし、チェックポイントをロードする
+    """
     model_id = "Chrom256/gemma-2-9b-it-lora_20241216_033631"
     base_model_id = "google/gemma-2-9b"
     downloaded_components = {"model": None, "tokenizer": None}
         with download_lock:
             downloaded_components["tokenizer"] = tokenizer
     torch.cuda.empty_cache()
     with ThreadPoolExecutor(max_workers=2) as executor:
         model_future = executor.submit(download_base_model)
         tokenizer_future = executor.submit(download_tokenizer)
         model_future.result()
         tokenizer_future.result()
     model = downloaded_components["model"]
     tokenizer = downloaded_components["tokenizer"]
     torch.cuda.empty_cache()
     try:
         adapter_path = model_id
         print(f"Loading adapter from {adapter_path}")
         print(f"Error loading adapter: {e}")
         raise
     model.config.use_cache = True
     model.eval()
     torch.cuda.empty_cache()
     return model, tokenizer
 """ for item in batch
         ]
         inputs = tokenizer(
             prompts,
             padding=True,
                 elif 'model' in response:
                     response = response.split('model')[-1].strip()
                 response = post_process_output(response)
                 results.append({
                     "output": response
                 })
         del outputs, inputs
         torch.cuda.empty_cache()