eunseong
/

care_qwen_pt

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "args_ours": {
+    "RQVAE_path": null,
+    "alpha_codebook": null,
+    "alpha_contrastive": 0.0,
+    "alpha_kl": null,
+    "alpha_masked_kl": 0.0,
+    "alpha_multi_task": 0.0,
+    "alpha_nll": 1.0,
+    "alpha_ortho": null,
+    "alpha_paraphrase": 0.0,
+    "alpha_rqvae": null,
+    "answers_path": null,
+    "assist_content_kl": "content",
+    "assist_content_nll": "content",
+    "base_model": null,
+    "built_in_rqvae": null,
+    "chat_format": "icae",
+    "checkpoint_path": null,
+    "checkpointing_steps": 1000,
+    "clip_grad_norm": -1.0,
+    "codebook_force": null,
+    "completed_steps": null,
+    "config": "config/language_modeling/pretrain_icae_qwen.yaml",
+    "ctx_change": "gt",
+    "ctx_change_ratio": 2.0,
+    "ctx_change_shuffle_ratio": 2.0,
+    "ctx_kl_weight": null,
+    "ctx_kl_weight_group": [],
+    "ctx_nll": "gt",
+    "ctx_nll_gt_prob": 0.0,
+    "ctx_nll_weight": null,
+    "ctx_nll_weight_group": [],
+    "ctx_select": null,
+    "ctx_student": "gt",
+    "ctx_student_gt_prob": 0.0,
+    "ctx_teacher": "gt",
+    "ctx_teacher_gt_prob": 0.0,
+    "demo": null,
+    "dev_file": "./data/pretrain/dev.jsonl",
+    "distill_topk": null,
+    "eval_file": null,
+    "exclude_dataset_type": null,
+    "exp_name": "pretrain_icae_16",
+    "exp_note": null,
+    "gradient_accumulation_steps": 12,
+    "gradient_checkpointing": null,
+    "icae_freeze": false,
+    "icae_mem_size": 16,
+    "icae_name_or_path": "Qwen/Qwen3-8B",
+    "icae_projector": null,
+    "index_path": null,
+    "init_codebook": null,
+    "kl_exclude_first_token": false,
+    "kl_temperature": null,
+    "kl_weight": 2.0,
+    "learning_rate": 0.0002,
+    "logging_steps": 1,
+    "lora_param_version": "v4",
+    "lr_scheduler_type": "linear",
+    "max_seq_length": 336,
+    "max_train_samples": 2000000,
+    "max_train_steps": 5209,
+    "model_name_or_path": "Qwen/Qwen3-8B",
+    "multiple_projector": null,
+    "nll_weight": 2.0,
+    "num_chunking": 1,
+    "num_train_epochs": 1,
+    "num_views": 1,
+    "only_first_token": null,
+    "output_dir": "./wandb/run-20250628_093437-4p0ubi4c/files/checkpoint",
+    "overwrite_cache": false,
+    "per_device_train_batch_size": 8,
+    "preprocessing_chat_format": "icae",
+    "preprocessing_num_workers": 16,
+    "prob_masked_kl": 0.0,
+    "project_name": "icae_pretraining",
+    "refer_context": null,
+    "ret_embedding_path": null,
+    "retrieval_context_length": 180,
+    "retriever_name_or_path": null,
+    "same_nll_kl": true,
+    "seed": 980406,
+    "select_criteria": null,
+    "select_gt_ratio": 0.0,
+    "select_negative_ratio": 0.0,
+    "sum_codebook": null,
+    "task_type": "pretrain",
+    "train_file": "./data/pretrain/train.jsonl",
+    "train_shuffle": true,
+    "update_codebook": null,
+    "update_projector_only": true,
+    "use_fast_tokenizer": null,
+    "use_flash_attn": true,
+    "use_rag_tuning": null,
+    "use_xrag_embedding": null,
+    "warmup_ratio": 0.03,
+    "weight_decay": 0.0,
+    "workdir": "."
+  },
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "icae_projector": null,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "mem_size": 16,
+  "model_type": "qwen3",
+  "multiple_projector": null,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "projector_type": "mlp2x_gelu",
+  "retrieval_embed_length": 16,
+  "retriever_hidden_size": 4096,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}