eunseong committed on
Commit
88f668a
·
verified ·
1 Parent(s): 4a4967f

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. ckpt.pth +3 -0
  2. config.json +135 -0
ckpt.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36441afd84ee5c1abb18a766c4039ebbac71ceb050054e7038d1fe7438dbbea7
3
+ size 87719689
config.json ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "args_ours": {
6
+ "RQVAE_path": null,
7
+ "alpha_codebook": null,
8
+ "alpha_contrastive": 0.0,
9
+ "alpha_kl": null,
10
+ "alpha_masked_kl": 0.0,
11
+ "alpha_multi_task": 0.0,
12
+ "alpha_nll": 1.0,
13
+ "alpha_ortho": null,
14
+ "alpha_paraphrase": 0.0,
15
+ "alpha_rqvae": null,
16
+ "answers_path": null,
17
+ "assist_content_kl": "content",
18
+ "assist_content_nll": "content",
19
+ "base_model": null,
20
+ "built_in_rqvae": null,
21
+ "chat_format": "icae",
22
+ "checkpoint_path": null,
23
+ "checkpointing_steps": 1000,
24
+ "clip_grad_norm": -1.0,
25
+ "codebook_force": null,
26
+ "completed_steps": null,
27
+ "config": "config/language_modeling/pretrain_icae_qwen.yaml",
28
+ "ctx_change": "gt",
29
+ "ctx_change_ratio": 2.0,
30
+ "ctx_change_shuffle_ratio": 2.0,
31
+ "ctx_kl_weight": null,
32
+ "ctx_kl_weight_group": [],
33
+ "ctx_nll": "gt",
34
+ "ctx_nll_gt_prob": 0.0,
35
+ "ctx_nll_weight": null,
36
+ "ctx_nll_weight_group": [],
37
+ "ctx_select": null,
38
+ "ctx_student": "gt",
39
+ "ctx_student_gt_prob": 0.0,
40
+ "ctx_teacher": "gt",
41
+ "ctx_teacher_gt_prob": 0.0,
42
+ "demo": null,
43
+ "dev_file": "./data/pretrain/dev.jsonl",
44
+ "distill_topk": null,
45
+ "eval_file": null,
46
+ "exclude_dataset_type": null,
47
+ "exp_name": "pretrain_icae_16",
48
+ "exp_note": null,
49
+ "gradient_accumulation_steps": 12,
50
+ "gradient_checkpointing": null,
51
+ "icae_freeze": false,
52
+ "icae_mem_size": 16,
53
+ "icae_name_or_path": "Qwen/Qwen3-8B",
54
+ "icae_projector": null,
55
+ "index_path": null,
56
+ "init_codebook": null,
57
+ "kl_exclude_first_token": false,
58
+ "kl_temperature": null,
59
+ "kl_weight": 2.0,
60
+ "learning_rate": 0.0002,
61
+ "logging_steps": 1,
62
+ "lora_param_version": "v4",
63
+ "lr_scheduler_type": "linear",
64
+ "max_seq_length": 336,
65
+ "max_train_samples": 2000000,
66
+ "max_train_steps": 5209,
67
+ "model_name_or_path": "Qwen/Qwen3-8B",
68
+ "multiple_projector": null,
69
+ "nll_weight": 2.0,
70
+ "num_chunking": 1,
71
+ "num_train_epochs": 1,
72
+ "num_views": 1,
73
+ "only_first_token": null,
74
+ "output_dir": "./wandb/run-20250628_093437-4p0ubi4c/files/checkpoint",
75
+ "overwrite_cache": false,
76
+ "per_device_train_batch_size": 8,
77
+ "preprocessing_chat_format": "icae",
78
+ "preprocessing_num_workers": 16,
79
+ "prob_masked_kl": 0.0,
80
+ "project_name": "icae_pretraining",
81
+ "refer_context": null,
82
+ "ret_embedding_path": null,
83
+ "retrieval_context_length": 180,
84
+ "retriever_name_or_path": null,
85
+ "same_nll_kl": true,
86
+ "seed": 980406,
87
+ "select_criteria": null,
88
+ "select_gt_ratio": 0.0,
89
+ "select_negative_ratio": 0.0,
90
+ "sum_codebook": null,
91
+ "task_type": "pretrain",
92
+ "train_file": "./data/pretrain/train.jsonl",
93
+ "train_shuffle": true,
94
+ "update_codebook": null,
95
+ "update_projector_only": true,
96
+ "use_fast_tokenizer": null,
97
+ "use_flash_attn": true,
98
+ "use_rag_tuning": null,
99
+ "use_xrag_embedding": null,
100
+ "warmup_ratio": 0.03,
101
+ "weight_decay": 0.0,
102
+ "workdir": "."
103
+ },
104
+ "attention_bias": false,
105
+ "attention_dropout": 0.0,
106
+ "bos_token_id": 151643,
107
+ "eos_token_id": 151645,
108
+ "head_dim": 128,
109
+ "hidden_act": "silu",
110
+ "hidden_size": 4096,
111
+ "icae_projector": null,
112
+ "initializer_range": 0.02,
113
+ "intermediate_size": 12288,
114
+ "max_position_embeddings": 40960,
115
+ "max_window_layers": 36,
116
+ "mem_size": 16,
117
+ "model_type": "qwen3",
118
+ "multiple_projector": null,
119
+ "num_attention_heads": 32,
120
+ "num_hidden_layers": 36,
121
+ "num_key_value_heads": 8,
122
+ "projector_type": "mlp2x_gelu",
123
+ "retrieval_embed_length": 16,
124
+ "retriever_hidden_size": 4096,
125
+ "rms_norm_eps": 1e-06,
126
+ "rope_scaling": null,
127
+ "rope_theta": 1000000,
128
+ "sliding_window": null,
129
+ "tie_word_embeddings": false,
130
+ "torch_dtype": "bfloat16",
131
+ "transformers_version": "4.51.0",
132
+ "use_cache": true,
133
+ "use_sliding_window": false,
134
+ "vocab_size": 151936
135
+ }