ameerazam08 commited on
Commit
48b9876
1 Parent(s): 9425e9c

Delete wandb

Browse files
wandb/debug-internal.log DELETED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log DELETED
@@ -1,27 +0,0 @@
1
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Configure stats pid to 116027
3
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/.config/wandb/settings
4
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/Documents/Ameer/gemma/wandb/settings
5
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'new.py', 'program_abspath': '/home/rnd/Documents/Ameer/gemma/new.py', 'program': '/home/rnd/Documents/Ameer/gemma/new.py'}
8
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():526] Logging user logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug.log
9
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():527] Logging internal logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug-internal.log
10
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():566] calling init triggers
11
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
- config: {}
13
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():616] starting backend
14
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():620] setting up manager
15
- 2024-02-23 03:24:22,982 INFO MainThread:116027 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
- 2024-02-23 03:24:22,984 INFO MainThread:116027 [wandb_init.py:init():628] backend started and connected
17
- 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():720] updated telemetry
18
- 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
- 2024-02-23 03:24:23,690 INFO MainThread:116027 [wandb_run.py:_on_init():2262] communicating current version
20
- 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_run.py:_on_init():2271] got version response
21
- 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_init.py:init():804] starting run threads in backend
22
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_console_start():2241] atexit reg
23
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2096] redirect: wrap_raw
24
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2161] Wrapping output streams.
25
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2186] Redirects installed.
26
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_init.py:init():847] run started, returning control to user process
27
- 2024-02-23 03:24:24,872 INFO MainThread:116027 [wandb_run.py:_config_callback():1343] config_cb None None {'vocab_size': 256000, 'max_position_embeddings': 8192, 'hidden_size': 2048, 'intermediate_size': 16384, 'num_hidden_layers': 18, 'num_attention_heads': 8, 'head_dim': 256, 'num_key_value_heads': 1, 'hidden_act': 'gelu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 10000.0, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GemmaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 2, 'pad_token_id': 0, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'google/gemma-2b', 'transformers_version': '4.39.0.dev0', 'model_type': 'gemma', 'rope_scaling': None, 'quantization_config': {'quant_method': 'QuantizationMethod.BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': './gemma-jokes-gemma', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2.5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 500, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 25, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'gemma-jokes-gemma-2024-02-23-03-24', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20240223_032422-b657btrg/files/conda-environment.yaml DELETED
@@ -1,123 +0,0 @@
1
- name: gemma
2
- channels:
3
- - defaults
4
- dependencies:
5
- - _libgcc_mutex=0.1=main
6
- - _openmp_mutex=5.1=1_gnu
7
- - ca-certificates=2023.12.12=h06a4308_0
8
- - ld_impl_linux-64=2.38=h1181459_1
9
- - libffi=3.4.4=h6a678d5_0
10
- - libgcc-ng=11.2.0=h1234567_1
11
- - libgomp=11.2.0=h1234567_1
12
- - libstdcxx-ng=11.2.0=h1234567_1
13
- - ncurses=6.4=h6a678d5_0
14
- - openssl=3.0.13=h7f8727e_0
15
- - pip=23.3.1=py39h06a4308_0
16
- - python=3.9.18=h955ad1f_0
17
- - readline=8.2=h5eee18b_0
18
- - setuptools=68.2.2=py39h06a4308_0
19
- - sqlite=3.41.2=h5eee18b_0
20
- - tk=8.6.12=h1ccaba5_0
21
- - wheel=0.41.2=py39h06a4308_0
22
- - xz=5.4.5=h5eee18b_0
23
- - zlib=1.2.13=h5eee18b_0
24
- - pip:
25
- - accelerate==0.28.0.dev0
26
- - aiohttp==3.9.3
27
- - aiosignal==1.3.1
28
- - appdirs==1.4.4
29
- - asttokens==2.4.1
30
- - async-timeout==4.0.3
31
- - attrs==23.2.0
32
- - bitsandbytes==0.42.0
33
- - certifi==2024.2.2
34
- - charset-normalizer==3.3.2
35
- - click==8.1.7
36
- - comm==0.2.1
37
- - contourpy==1.2.0
38
- - cycler==0.12.1
39
- - datasets==2.17.1
40
- - decorator==5.1.1
41
- - dill==0.3.8
42
- - docker-pycreds==0.4.0
43
- - exceptiongroup==1.2.0
44
- - executing==2.0.1
45
- - filelock==3.13.1
46
- - fonttools==4.49.0
47
- - frozenlist==1.4.1
48
- - fsspec==2023.10.0
49
- - gitdb==4.0.11
50
- - gitpython==3.1.42
51
- - huggingface-hub==0.20.3
52
- - idna==3.6
53
- - importlib-resources==6.1.1
54
- - ipython==8.18.1
55
- - ipywidgets==8.1.2
56
- - jedi==0.19.1
57
- - jinja2==3.1.3
58
- - jupyterlab-widgets==3.0.10
59
- - kiwisolver==1.4.5
60
- - markupsafe==2.1.5
61
- - matplotlib==3.8.3
62
- - matplotlib-inline==0.1.6
63
- - mpmath==1.3.0
64
- - multidict==6.0.5
65
- - multiprocess==0.70.16
66
- - networkx==3.2.1
67
- - numpy==1.26.4
68
- - nvidia-cublas-cu12==12.1.3.1
69
- - nvidia-cuda-cupti-cu12==12.1.105
70
- - nvidia-cuda-nvrtc-cu12==12.1.105
71
- - nvidia-cuda-runtime-cu12==12.1.105
72
- - nvidia-cudnn-cu12==8.9.2.26
73
- - nvidia-cufft-cu12==11.0.2.54
74
- - nvidia-curand-cu12==10.3.2.106
75
- - nvidia-cusolver-cu12==11.4.5.107
76
- - nvidia-cusparse-cu12==12.1.0.106
77
- - nvidia-nccl-cu12==2.19.3
78
- - nvidia-nvjitlink-cu12==12.3.101
79
- - nvidia-nvtx-cu12==12.1.105
80
- - packaging==23.2
81
- - pandas==2.2.0
82
- - parso==0.8.3
83
- - peft==0.8.2
84
- - pexpect==4.9.0
85
- - pillow==10.2.0
86
- - prompt-toolkit==3.0.43
87
- - protobuf==4.25.3
88
- - psutil==5.9.8
89
- - ptyprocess==0.7.0
90
- - pure-eval==0.2.2
91
- - pyarrow==15.0.0
92
- - pyarrow-hotfix==0.6
93
- - pygments==2.17.2
94
- - pyparsing==3.1.1
95
- - python-dateutil==2.8.2
96
- - pytz==2024.1
97
- - pyyaml==6.0.1
98
- - regex==2023.12.25
99
- - requests==2.31.0
100
- - safetensors==0.4.2
101
- - scipy==1.12.0
102
- - sentry-sdk==1.40.5
103
- - setproctitle==1.3.3
104
- - six==1.16.0
105
- - smmap==5.0.1
106
- - stack-data==0.6.3
107
- - sympy==1.12
108
- - tokenizers==0.15.2
109
- - torch==2.2.1
110
- - tqdm==4.66.2
111
- - traitlets==5.14.1
112
- - transformers==4.39.0.dev0
113
- - triton==2.2.0
114
- - typing-extensions==4.9.0
115
- - tzdata==2024.1
116
- - urllib3==2.2.1
117
- - wandb==0.16.3
118
- - wcwidth==0.2.13
119
- - widgetsnbextension==4.0.10
120
- - xxhash==3.4.1
121
- - yarl==1.9.4
122
- - zipp==3.17.0
123
- prefix: /home/rnd/miniconda3/envs/gemma
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20240223_032422-b657btrg/files/config.yaml DELETED
@@ -1,682 +0,0 @@
1
- wandb_version: 1
2
-
3
- _wandb:
4
- desc: null
5
- value:
6
- python_version: 3.9.18
7
- cli_version: 0.16.3
8
- framework: huggingface
9
- huggingface_version: 4.39.0.dev0
10
- is_jupyter_run: false
11
- is_kaggle_kernel: false
12
- start_time: 1708638862.984978
13
- t:
14
- 1:
15
- - 1
16
- - 11
17
- - 49
18
- - 51
19
- - 55
20
- - 71
21
- - 98
22
- 2:
23
- - 1
24
- - 11
25
- - 49
26
- - 51
27
- - 55
28
- - 71
29
- - 98
30
- 3:
31
- - 7
32
- - 13
33
- - 23
34
- 4: 3.9.18
35
- 5: 0.16.3
36
- 6: 4.39.0.dev0
37
- 8:
38
- - 5
39
- 9:
40
- 1: transformers_trainer
41
- 13: linux-x86_64
42
- m:
43
- - 1: train/global_step
44
- 6:
45
- - 3
46
- - 1: train/loss
47
- 5: 1
48
- 6:
49
- - 1
50
- - 1: train/grad_norm
51
- 5: 1
52
- 6:
53
- - 1
54
- - 1: train/learning_rate
55
- 5: 1
56
- 6:
57
- - 1
58
- - 1: train/epoch
59
- 5: 1
60
- 6:
61
- - 1
62
- - 1: eval/loss
63
- 5: 1
64
- 6:
65
- - 1
66
- - 1: eval/runtime
67
- 5: 1
68
- 6:
69
- - 1
70
- - 1: eval/samples_per_second
71
- 5: 1
72
- 6:
73
- - 1
74
- - 1: eval/steps_per_second
75
- 5: 1
76
- 6:
77
- - 1
78
- vocab_size:
79
- desc: null
80
- value: 256000
81
- max_position_embeddings:
82
- desc: null
83
- value: 8192
84
- hidden_size:
85
- desc: null
86
- value: 2048
87
- intermediate_size:
88
- desc: null
89
- value: 16384
90
- num_hidden_layers:
91
- desc: null
92
- value: 18
93
- num_attention_heads:
94
- desc: null
95
- value: 8
96
- head_dim:
97
- desc: null
98
- value: 256
99
- num_key_value_heads:
100
- desc: null
101
- value: 1
102
- hidden_act:
103
- desc: null
104
- value: gelu
105
- initializer_range:
106
- desc: null
107
- value: 0.02
108
- rms_norm_eps:
109
- desc: null
110
- value: 1.0e-06
111
- use_cache:
112
- desc: null
113
- value: false
114
- rope_theta:
115
- desc: null
116
- value: 10000.0
117
- attention_bias:
118
- desc: null
119
- value: false
120
- attention_dropout:
121
- desc: null
122
- value: 0.0
123
- return_dict:
124
- desc: null
125
- value: true
126
- output_hidden_states:
127
- desc: null
128
- value: false
129
- output_attentions:
130
- desc: null
131
- value: false
132
- torchscript:
133
- desc: null
134
- value: false
135
- torch_dtype:
136
- desc: null
137
- value: bfloat16
138
- use_bfloat16:
139
- desc: null
140
- value: false
141
- tf_legacy_loss:
142
- desc: null
143
- value: false
144
- pruned_heads:
145
- desc: null
146
- value: {}
147
- tie_word_embeddings:
148
- desc: null
149
- value: true
150
- chunk_size_feed_forward:
151
- desc: null
152
- value: 0
153
- is_encoder_decoder:
154
- desc: null
155
- value: false
156
- is_decoder:
157
- desc: null
158
- value: false
159
- cross_attention_hidden_size:
160
- desc: null
161
- value: null
162
- add_cross_attention:
163
- desc: null
164
- value: false
165
- tie_encoder_decoder:
166
- desc: null
167
- value: false
168
- max_length:
169
- desc: null
170
- value: 20
171
- min_length:
172
- desc: null
173
- value: 0
174
- do_sample:
175
- desc: null
176
- value: false
177
- early_stopping:
178
- desc: null
179
- value: false
180
- num_beams:
181
- desc: null
182
- value: 1
183
- num_beam_groups:
184
- desc: null
185
- value: 1
186
- diversity_penalty:
187
- desc: null
188
- value: 0.0
189
- temperature:
190
- desc: null
191
- value: 1.0
192
- top_k:
193
- desc: null
194
- value: 50
195
- top_p:
196
- desc: null
197
- value: 1.0
198
- typical_p:
199
- desc: null
200
- value: 1.0
201
- repetition_penalty:
202
- desc: null
203
- value: 1.0
204
- length_penalty:
205
- desc: null
206
- value: 1.0
207
- no_repeat_ngram_size:
208
- desc: null
209
- value: 0
210
- encoder_no_repeat_ngram_size:
211
- desc: null
212
- value: 0
213
- bad_words_ids:
214
- desc: null
215
- value: null
216
- num_return_sequences:
217
- desc: null
218
- value: 1
219
- output_scores:
220
- desc: null
221
- value: false
222
- return_dict_in_generate:
223
- desc: null
224
- value: false
225
- forced_bos_token_id:
226
- desc: null
227
- value: null
228
- forced_eos_token_id:
229
- desc: null
230
- value: null
231
- remove_invalid_values:
232
- desc: null
233
- value: false
234
- exponential_decay_length_penalty:
235
- desc: null
236
- value: null
237
- suppress_tokens:
238
- desc: null
239
- value: null
240
- begin_suppress_tokens:
241
- desc: null
242
- value: null
243
- architectures:
244
- desc: null
245
- value:
246
- - GemmaForCausalLM
247
- finetuning_task:
248
- desc: null
249
- value: null
250
- id2label:
251
- desc: null
252
- value:
253
- '0': LABEL_0
254
- '1': LABEL_1
255
- label2id:
256
- desc: null
257
- value:
258
- LABEL_0: 0
259
- LABEL_1: 1
260
- tokenizer_class:
261
- desc: null
262
- value: null
263
- prefix:
264
- desc: null
265
- value: null
266
- bos_token_id:
267
- desc: null
268
- value: 2
269
- pad_token_id:
270
- desc: null
271
- value: 0
272
- eos_token_id:
273
- desc: null
274
- value: 1
275
- sep_token_id:
276
- desc: null
277
- value: null
278
- decoder_start_token_id:
279
- desc: null
280
- value: null
281
- task_specific_params:
282
- desc: null
283
- value: null
284
- problem_type:
285
- desc: null
286
- value: null
287
- _name_or_path:
288
- desc: null
289
- value: google/gemma-2b
290
- transformers_version:
291
- desc: null
292
- value: 4.39.0.dev0
293
- model_type:
294
- desc: null
295
- value: gemma
296
- rope_scaling:
297
- desc: null
298
- value: null
299
- quantization_config:
300
- desc: null
301
- value:
302
- quant_method: QuantizationMethod.BITS_AND_BYTES
303
- _load_in_8bit: false
304
- _load_in_4bit: true
305
- llm_int8_threshold: 6.0
306
- llm_int8_skip_modules: null
307
- llm_int8_enable_fp32_cpu_offload: false
308
- llm_int8_has_fp16_weight: false
309
- bnb_4bit_quant_type: nf4
310
- bnb_4bit_use_double_quant: true
311
- bnb_4bit_compute_dtype: bfloat16
312
- load_in_4bit: true
313
- load_in_8bit: false
314
- output_dir:
315
- desc: null
316
- value: ./gemma-jokes-gemma
317
- overwrite_output_dir:
318
- desc: null
319
- value: false
320
- do_train:
321
- desc: null
322
- value: false
323
- do_eval:
324
- desc: null
325
- value: true
326
- do_predict:
327
- desc: null
328
- value: false
329
- evaluation_strategy:
330
- desc: null
331
- value: steps
332
- prediction_loss_only:
333
- desc: null
334
- value: false
335
- per_device_train_batch_size:
336
- desc: null
337
- value: 2
338
- per_device_eval_batch_size:
339
- desc: null
340
- value: 8
341
- per_gpu_train_batch_size:
342
- desc: null
343
- value: null
344
- per_gpu_eval_batch_size:
345
- desc: null
346
- value: null
347
- gradient_accumulation_steps:
348
- desc: null
349
- value: 1
350
- eval_accumulation_steps:
351
- desc: null
352
- value: null
353
- eval_delay:
354
- desc: null
355
- value: 0
356
- learning_rate:
357
- desc: null
358
- value: 2.5e-05
359
- weight_decay:
360
- desc: null
361
- value: 0.0
362
- adam_beta1:
363
- desc: null
364
- value: 0.9
365
- adam_beta2:
366
- desc: null
367
- value: 0.999
368
- adam_epsilon:
369
- desc: null
370
- value: 1.0e-08
371
- max_grad_norm:
372
- desc: null
373
- value: 1.0
374
- num_train_epochs:
375
- desc: null
376
- value: 3.0
377
- max_steps:
378
- desc: null
379
- value: 500
380
- lr_scheduler_type:
381
- desc: null
382
- value: linear
383
- lr_scheduler_kwargs:
384
- desc: null
385
- value: {}
386
- warmup_ratio:
387
- desc: null
388
- value: 0.0
389
- warmup_steps:
390
- desc: null
391
- value: 1
392
- log_level:
393
- desc: null
394
- value: passive
395
- log_level_replica:
396
- desc: null
397
- value: warning
398
- log_on_each_node:
399
- desc: null
400
- value: true
401
- logging_dir:
402
- desc: null
403
- value: ./logs
404
- logging_strategy:
405
- desc: null
406
- value: steps
407
- logging_first_step:
408
- desc: null
409
- value: false
410
- logging_steps:
411
- desc: null
412
- value: 25
413
- logging_nan_inf_filter:
414
- desc: null
415
- value: true
416
- save_strategy:
417
- desc: null
418
- value: steps
419
- save_steps:
420
- desc: null
421
- value: 25
422
- save_total_limit:
423
- desc: null
424
- value: null
425
- save_safetensors:
426
- desc: null
427
- value: true
428
- save_on_each_node:
429
- desc: null
430
- value: false
431
- save_only_model:
432
- desc: null
433
- value: false
434
- no_cuda:
435
- desc: null
436
- value: false
437
- use_cpu:
438
- desc: null
439
- value: false
440
- use_mps_device:
441
- desc: null
442
- value: false
443
- seed:
444
- desc: null
445
- value: 42
446
- data_seed:
447
- desc: null
448
- value: null
449
- jit_mode_eval:
450
- desc: null
451
- value: false
452
- use_ipex:
453
- desc: null
454
- value: false
455
- bf16:
456
- desc: null
457
- value: true
458
- fp16:
459
- desc: null
460
- value: false
461
- fp16_opt_level:
462
- desc: null
463
- value: O1
464
- half_precision_backend:
465
- desc: null
466
- value: auto
467
- bf16_full_eval:
468
- desc: null
469
- value: false
470
- fp16_full_eval:
471
- desc: null
472
- value: false
473
- tf32:
474
- desc: null
475
- value: null
476
- local_rank:
477
- desc: null
478
- value: 0
479
- ddp_backend:
480
- desc: null
481
- value: null
482
- tpu_num_cores:
483
- desc: null
484
- value: null
485
- tpu_metrics_debug:
486
- desc: null
487
- value: false
488
- debug:
489
- desc: null
490
- value: []
491
- dataloader_drop_last:
492
- desc: null
493
- value: false
494
- eval_steps:
495
- desc: null
496
- value: 25
497
- dataloader_num_workers:
498
- desc: null
499
- value: 0
500
- dataloader_prefetch_factor:
501
- desc: null
502
- value: null
503
- past_index:
504
- desc: null
505
- value: -1
506
- run_name:
507
- desc: null
508
- value: gemma-jokes-gemma-2024-02-23-03-24
509
- disable_tqdm:
510
- desc: null
511
- value: false
512
- remove_unused_columns:
513
- desc: null
514
- value: true
515
- label_names:
516
- desc: null
517
- value: null
518
- load_best_model_at_end:
519
- desc: null
520
- value: false
521
- metric_for_best_model:
522
- desc: null
523
- value: null
524
- greater_is_better:
525
- desc: null
526
- value: null
527
- ignore_data_skip:
528
- desc: null
529
- value: false
530
- fsdp:
531
- desc: null
532
- value: []
533
- fsdp_min_num_params:
534
- desc: null
535
- value: 0
536
- fsdp_config:
537
- desc: null
538
- value:
539
- min_num_params: 0
540
- xla: false
541
- xla_fsdp_v2: false
542
- xla_fsdp_grad_ckpt: false
543
- fsdp_transformer_layer_cls_to_wrap:
544
- desc: null
545
- value: null
546
- accelerator_config:
547
- desc: null
548
- value:
549
- split_batches: false
550
- dispatch_batches: null
551
- even_batches: true
552
- use_seedable_sampler: true
553
- deepspeed:
554
- desc: null
555
- value: null
556
- label_smoothing_factor:
557
- desc: null
558
- value: 0.0
559
- optim:
560
- desc: null
561
- value: paged_adamw_8bit
562
- optim_args:
563
- desc: null
564
- value: null
565
- adafactor:
566
- desc: null
567
- value: false
568
- group_by_length:
569
- desc: null
570
- value: false
571
- length_column_name:
572
- desc: null
573
- value: length
574
- report_to:
575
- desc: null
576
- value:
577
- - wandb
578
- ddp_find_unused_parameters:
579
- desc: null
580
- value: null
581
- ddp_bucket_cap_mb:
582
- desc: null
583
- value: null
584
- ddp_broadcast_buffers:
585
- desc: null
586
- value: null
587
- dataloader_pin_memory:
588
- desc: null
589
- value: true
590
- dataloader_persistent_workers:
591
- desc: null
592
- value: false
593
- skip_memory_metrics:
594
- desc: null
595
- value: true
596
- use_legacy_prediction_loop:
597
- desc: null
598
- value: false
599
- push_to_hub:
600
- desc: null
601
- value: false
602
- resume_from_checkpoint:
603
- desc: null
604
- value: null
605
- hub_model_id:
606
- desc: null
607
- value: null
608
- hub_strategy:
609
- desc: null
610
- value: every_save
611
- hub_token:
612
- desc: null
613
- value: <HUB_TOKEN>
614
- hub_private_repo:
615
- desc: null
616
- value: false
617
- hub_always_push:
618
- desc: null
619
- value: false
620
- gradient_checkpointing:
621
- desc: null
622
- value: true
623
- gradient_checkpointing_kwargs:
624
- desc: null
625
- value: null
626
- include_inputs_for_metrics:
627
- desc: null
628
- value: false
629
- fp16_backend:
630
- desc: null
631
- value: auto
632
- push_to_hub_model_id:
633
- desc: null
634
- value: null
635
- push_to_hub_organization:
636
- desc: null
637
- value: null
638
- push_to_hub_token:
639
- desc: null
640
- value: <PUSH_TO_HUB_TOKEN>
641
- mp_parameters:
642
- desc: null
643
- value: ''
644
- auto_find_batch_size:
645
- desc: null
646
- value: false
647
- full_determinism:
648
- desc: null
649
- value: false
650
- torchdynamo:
651
- desc: null
652
- value: null
653
- ray_scope:
654
- desc: null
655
- value: last
656
- ddp_timeout:
657
- desc: null
658
- value: 1800
659
- torch_compile:
660
- desc: null
661
- value: false
662
- torch_compile_backend:
663
- desc: null
664
- value: null
665
- torch_compile_mode:
666
- desc: null
667
- value: null
668
- dispatch_batches:
669
- desc: null
670
- value: null
671
- split_batches:
672
- desc: null
673
- value: null
674
- include_tokens_per_second:
675
- desc: null
676
- value: false
677
- include_num_input_tokens_seen:
678
- desc: null
679
- value: false
680
- neftune_noise_alpha:
681
- desc: null
682
- value: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20240223_032422-b657btrg/files/output.log DELETED
@@ -1,1198 +0,0 @@
1
-
2
-
3
- 5%|███▎ | 25/500 [00:03<01:11, 6.69it/s]
4
- {'loss': 4.0257, 'grad_norm': 6.853318691253662, 'learning_rate': 2.3797595190380762e-05, 'epoch': 0.0}
5
-
6
-
7
-
8
-
9
-
10
-
11
-
12
-
13
-
14
-
15
-
16
-
17
-
18
-
19
-
20
-
21
-
22
-
23
-
24
-
25
-
26
-
27
-
28
-
29
-
30
-
31
-
32
-
33
-
34
-
35
-
36
-
37
-
38
-
39
-
40
-
41
-
42
-
43
-
44
-
45
-
46
-
47
-
48
-
49
-
50
-
51
-
52
-
53
-
54
-
55
-
56
-
57
-
58
-
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
-
67
-
68
-
69
-
70
-
71
-
72
-
73
-
74
-
75
-
76
-
77
-
78
-
79
-
80
-
81
-
82
-
83
-
84
-
85
-
86
-
87
-
88
-
89
-
90
-
91
-
92
-
93
-
94
-
95
-
96
-
97
-
98
-
99
-
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
-
108
-
109
-
110
-
111
-
112
-
113
-
114
-
115
-
116
-
117
-
118
-
119
-
120
-
121
-
122
-
123
-
124
-
125
-
126
-
127
-
128
-
129
-
130
-
131
-
132
-
133
-
134
-
135
-
136
-
137
-
138
-
139
-
140
-
141
-
142
-
143
-
144
-
145
-
146
-
147
-
148
-
149
-
150
-
151
-
152
-
153
-
154
-
155
-
156
-
157
-
158
-
159
-
160
-
161
-
162
-
163
-
164
-
165
-
166
-
167
-
168
-
169
-
170
-
171
- 100%|███████████████████████████████████████████████████████████████▊| 2888/2896 [05:34<00:00, 8.65it/s]
172
-
173
-
174
-
175
- 9%|██████▏ | 46/500 [05:44<01:44, 4.35it/s]
176
- 10%|██████▋ | 50/500 [05:44<01:16, 5.90it/s]
177
-
178
-
179
-
180
-
181
-
182
-
183
-
184
-
185
-
186
-
187
-
188
-
189
-
190
-
191
-
192
-
193
-
194
-
195
-
196
-
197
-
198
-
199
-
200
-
201
-
202
-
203
-
204
-
205
-
206
-
207
-
208
-
209
-
210
-
211
-
212
-
213
-
214
-
215
-
216
-
217
-
218
-
219
-
220
-
221
-
222
-
223
-
224
-
225
-
226
-
227
-
228
-
229
-
230
-
231
-
232
-
233
-
234
-
235
-
236
-
237
-
238
-
239
-
240
-
241
-
242
-
243
-
244
-
245
-
246
-
247
-
248
-
249
-
250
-
251
-
252
-
253
-
254
-
255
-
256
-
257
-
258
-
259
-
260
-
261
-
262
-
263
-
264
-
265
-
266
-
267
-
268
-
269
-
270
-
271
-
272
-
273
-
274
-
275
-
276
-
277
-
278
-
279
-
280
-
281
-
282
-
283
-
284
-
285
-
286
-
287
-
288
-
289
-
290
-
291
-
292
-
293
-
294
-
295
-
296
-
297
-
298
-
299
-
300
-
301
-
302
-
303
-
304
-
305
-
306
-
307
-
308
-
309
-
310
-
311
-
312
-
313
-
314
-
315
-
316
-
317
-
318
-
319
-
320
-
321
-
322
-
323
-
324
-
325
-
326
-
327
-
328
-
329
-
330
-
331
-
332
-
333
-
334
-
335
-
336
-
337
-
338
-
339
-
340
-
341
-
342
-
343
-
344
- 100%|███████████████████████████████████████████████████████████████████████████████████▉| 2893/2896 [05:35<00:00, 8.65it/s]
345
-
346
-
347
-
348
- 15%|█████████████ | 75/500 [11:26<01:12, 5.88it/s]
349
- {'loss': 3.2615, 'grad_norm': 5.772947788238525, 'learning_rate': 2.1292585170340683e-05, 'epoch': 0.0}
350
-
351
-
352
-
353
-
354
-
355
-
356
-
357
-
358
-
359
-
360
-
361
-
362
-
363
-
364
-
365
-
366
-
367
-
368
-
369
-
370
-
371
-
372
-
373
-
374
-
375
-
376
-
377
-
378
-
379
-
380
-
381
-
382
-
383
-
384
-
385
-
386
-
387
-
388
-
389
-
390
-
391
-
392
-
393
-
394
-
395
-
396
-
397
-
398
-
399
-
400
-
401
-
402
-
403
-
404
-
405
-
406
-
407
-
408
-
409
-
410
-
411
-
412
-
413
-
414
-
415
-
416
-
417
-
418
-
419
-
420
-
421
-
422
-
423
-
424
-
425
-
426
-
427
-
428
-
429
-
430
-
431
-
432
-
433
-
434
-
435
-
436
-
437
-
438
-
439
-
440
-
441
-
442
-
443
-
444
-
445
-
446
-
447
-
448
-
449
-
450
-
451
-
452
-
453
-
454
-
455
-
456
-
457
-
458
-
459
-
460
-
461
-
462
-
463
-
464
-
465
-
466
-
467
-
468
-
469
-
470
-
471
-
472
-
473
-
474
-
475
-
476
-
477
-
478
-
479
-
480
-
481
-
482
-
483
-
484
-
485
-
486
-
487
-
488
-
489
-
490
-
491
-
492
-
493
-
494
-
495
-
496
-
497
-
498
-
499
-
500
-
501
-
502
-
503
-
504
-
505
-
506
-
507
-
508
-
509
-
510
-
511
-
512
-
513
-
514
-
515
-
516
-
517
- 99%|███████████████████████████████████████████████████████████████████████████████████▌| 2880/2896 [06:00<00:01, 8.69it/s]
518
-
519
-
520
- 20%|█████████████████▏ | 100/500 [17:34<01:08, 5.83it/s]
521
- 0%| | 4/2896 [00:00<04:43, 10.21it/s]
522
-
523
-
524
-
525
-
526
-
527
-
528
-
529
-
530
-
531
-
532
-
533
-
534
-
535
-
536
-
537
-
538
-
539
-
540
-
541
-
542
-
543
-
544
-
545
-
546
-
547
-
548
-
549
-
550
-
551
-
552
-
553
-
554
-
555
-
556
-
557
-
558
-
559
-
560
-
561
-
562
-
563
-
564
-
565
-
566
-
567
-
568
-
569
-
570
-
571
-
572
-
573
-
574
-
575
-
576
-
577
-
578
-
579
-
580
-
581
-
582
-
583
-
584
-
585
-
586
-
587
-
588
-
589
-
590
-
591
-
592
-
593
-
594
-
595
-
596
-
597
-
598
-
599
-
600
-
601
-
602
-
603
-
604
-
605
-
606
-
607
-
608
-
609
-
610
-
611
-
612
-
613
-
614
-
615
-
616
-
617
-
618
-
619
-
620
-
621
-
622
-
623
-
624
-
625
-
626
-
627
-
628
-
629
-
630
-
631
-
632
-
633
-
634
-
635
-
636
-
637
-
638
-
639
-
640
-
641
-
642
-
643
-
644
-
645
-
646
-
647
-
648
-
649
-
650
-
651
-
652
-
653
-
654
-
655
-
656
-
657
-
658
-
659
-
660
-
661
-
662
-
663
-
664
-
665
-
666
-
667
-
668
-
669
-
670
-
671
-
672
-
673
-
674
-
675
-
676
-
677
-
678
-
679
-
680
-
681
-
682
-
683
-
684
-
685
-
686
-
687
-
688
-
689
-
690
-
691
-
692
-
693
- 99%|███████████████████████████████████████████████████████████████████████████████████▌| 2881/2896 [05:40<00:01, 8.69it/s]
694
-
695
-
696
- 23%|████████████████████ | 117/500 [23:21<03:08, 2.03it/s]
697
- 25%|█████████████████████▌ | 125/500 [23:22<01:03, 5.91it/s]
698
-
699
-
700
-
701
-
702
-
703
-
704
-
705
-
706
-
707
-
708
-
709
-
710
-
711
-
712
-
713
-
714
-
715
-
716
-
717
-
718
-
719
-
720
-
721
-
722
-
723
-
724
-
725
-
726
-
727
-
728
-
729
-
730
-
731
-
732
-
733
-
734
-
735
-
736
-
737
-
738
-
739
-
740
-
741
-
742
-
743
-
744
-
745
-
746
-
747
-
748
-
749
-
750
-
751
-
752
-
753
-
754
-
755
-
756
-
757
-
758
-
759
-
760
-
761
-
762
-
763
-
764
-
765
-
766
-
767
-
768
-
769
-
770
-
771
-
772
-
773
-
774
-
775
-
776
-
777
-
778
-
779
-
780
-
781
-
782
-
783
-
784
-
785
-
786
-
787
-
788
-
789
-
790
-
791
-
792
-
793
-
794
-
795
-
796
-
797
-
798
-
799
-
800
-
801
-
802
-
803
-
804
-
805
-
806
-
807
-
808
-
809
-
810
-
811
-
812
-
813
-
814
-
815
-
816
-
817
-
818
-
819
-
820
-
821
-
822
-
823
-
824
-
825
-
826
-
827
-
828
-
829
-
830
-
831
-
832
-
833
-
834
-
835
-
836
-
837
-
838
-
839
-
840
-
841
-
842
-
843
-
844
-
845
-
846
-
847
-
848
-
849
-
850
-
851
-
852
-
853
-
854
-
855
-
856
-
857
-
858
-
859
-
860
-
861
-
862
-
863
-
864
-
865
-
866
-
867
- 100%|███████████████████████████████████████████████████████████████████████████████████▉| 2894/2896 [05:39<00:00, 8.67it/s]
868
-
869
-
870
- 30%|█████████████████████████▊ | 150/500 [29:07<00:59, 5.91it/s]
871
- 0%| | 2/2896 [00:00<02:46, 17.41it/s]
872
-
873
-
874
-
875
-
876
-
877
-
878
-
879
-
880
-
881
-
882
-
883
-
884
-
885
-
886
-
887
-
888
-
889
-
890
-
891
-
892
-
893
-
894
-
895
-
896
-
897
-
898
-
899
-
900
-
901
-
902
-
903
-
904
-
905
-
906
-
907
-
908
-
909
-
910
-
911
-
912
-
913
-
914
-
915
-
916
-
917
-
918
-
919
-
920
-
921
-
922
-
923
-
924
-
925
-
926
-
927
-
928
-
929
-
930
-
931
-
932
-
933
-
934
-
935
-
936
-
937
-
938
-
939
-
940
-
941
-
942
-
943
-
944
-
945
-
946
-
947
-
948
-
949
-
950
-
951
-
952
-
953
-
954
-
955
-
956
-
957
-
958
-
959
-
960
-
961
-
962
-
963
-
964
-
965
-
966
-
967
-
968
-
969
-
970
-
971
-
972
-
973
-
974
-
975
-
976
-
977
-
978
-
979
-
980
-
981
-
982
-
983
-
984
-
985
-
986
-
987
-
988
-
989
-
990
-
991
-
992
-
993
-
994
-
995
-
996
-
997
-
998
-
999
-
1000
-
1001
-
1002
-
1003
-
1004
-
1005
-
1006
-
1007
-
1008
-
1009
-
1010
-
1011
-
1012
-
1013
-
1014
-
1015
-
1016
-
1017
-
1018
-
1019
-
1020
-
1021
-
1022
-
1023
-
1024
-
1025
-
1026
-
1027
-
1028
-
1029
-
1030
-
1031
-
1032
-
1033
-
1034
-
1035
-
1036
-
1037
-
1038
-
1039
-
1040
-
1041
-
1042
- 99%|███████████████████████████████████████████████████████████████████████████████████▌| 2880/2896 [05:38<00:01, 8.68it/s]
1043
-
1044
-
1045
- 35%|██████████████████████████████ | 175/500 [34:54<00:55, 5.88it/s]
1046
- 0%| | 4/2896 [00:00<04:25, 10.91it/s]
1047
-
1048
-
1049
-
1050
-
1051
-
1052
-
1053
-
1054
-
1055
-
1056
-
1057
-
1058
-
1059
-
1060
-
1061
-
1062
-
1063
-
1064
-
1065
-
1066
-
1067
-
1068
-
1069
-
1070
-
1071
-
1072
-
1073
-
1074
-
1075
-
1076
-
1077
-
1078
-
1079
-
1080
-
1081
-
1082
-
1083
-
1084
-
1085
-
1086
-
1087
-
1088
-
1089
-
1090
-
1091
-
1092
-
1093
-
1094
-
1095
-
1096
-
1097
-
1098
-
1099
-
1100
-
1101
-
1102
-
1103
-
1104
-
1105
-
1106
-
1107
-
1108
-
1109
-
1110
-
1111
-
1112
-
1113
-
1114
-
1115
-
1116
-
1117
-
1118
-
1119
-
1120
-
1121
-
1122
-
1123
-
1124
-
1125
-
1126
-
1127
-
1128
-
1129
-
1130
-
1131
-
1132
-
1133
-
1134
-
1135
-
1136
-
1137
-
1138
-
1139
-
1140
-
1141
-
1142
-
1143
-
1144
-
1145
-
1146
-
1147
-
1148
-
1149
-
1150
- File "/home/rnd/Documents/Ameer/gemma/new.py", line 184, in <module> | 1763/2896 [03:27<02:10, 8.69it/s]
1151
- trainer.train()
1152
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 1624, in train
1153
- return inner_training_loop(
1154
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 2029, in _inner_training_loop
1155
- self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
1156
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 2412, in _maybe_log_save_evaluate
1157
- metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
1158
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 3229, in evaluate
1159
- output = eval_loop(
1160
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 3418, in evaluation_loop
1161
- loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
1162
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 3635, in prediction_step
1163
- loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
1164
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 2925, in compute_loss
1165
- outputs = model(**inputs)
1166
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
1167
- return self._call_impl(*args, **kwargs)
1168
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
1169
- return forward_call(*args, **kwargs)
1170
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/utils/operations.py", line 829, in forward
1171
- return model_forward(*args, **kwargs)
1172
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/utils/operations.py", line 817, in __call__
1173
- return convert_to_fp32(self.model_forward(*args, **kwargs))
1174
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
1175
- return func(*args, **kwargs)
1176
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/peft/peft_model.py", line 1091, in forward
1177
- return self.base_model(
1178
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
1179
- return self._call_impl(*args, **kwargs)
1180
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
1181
- return forward_call(*args, **kwargs)
1182
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/peft/tuners/tuners_utils.py", line 160, in forward
1183
- return self.model.forward(*args, **kwargs)
1184
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
1185
- output = module._old_forward(*args, **kwargs)
1186
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/models/gemma/modeling_gemma.py", line 1070, in forward
1187
- outputs = self.model(
1188
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
1189
- return self._call_impl(*args, **kwargs)
1190
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
1191
- return forward_call(*args, **kwargs)
1192
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
1193
- output = module._old_forward(*args, **kwargs)
1194
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/models/gemma/modeling_gemma.py", line 875, in forward
1195
- causal_mask = self._update_causal_mask(attention_mask, inputs_embeds)
1196
- File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/models/gemma/modeling_gemma.py", line 979, in _update_causal_mask
1197
- if not is_tracing and torch.any(attention_mask != 1):
1198
- KeyboardInterrupt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20240223_032422-b657btrg/files/requirements.txt DELETED
@@ -1,101 +0,0 @@
1
- accelerate==0.28.0.dev0
2
- aiohttp==3.9.3
3
- aiosignal==1.3.1
4
- appdirs==1.4.4
5
- asttokens==2.4.1
6
- async-timeout==4.0.3
7
- attrs==23.2.0
8
- bitsandbytes==0.42.0
9
- certifi==2024.2.2
10
- charset-normalizer==3.3.2
11
- click==8.1.7
12
- comm==0.2.1
13
- contourpy==1.2.0
14
- cycler==0.12.1
15
- datasets==2.17.1
16
- decorator==5.1.1
17
- dill==0.3.8
18
- docker-pycreds==0.4.0
19
- exceptiongroup==1.2.0
20
- executing==2.0.1
21
- filelock==3.13.1
22
- fonttools==4.49.0
23
- frozenlist==1.4.1
24
- fsspec==2023.10.0
25
- gitdb==4.0.11
26
- gitpython==3.1.42
27
- huggingface-hub==0.20.3
28
- idna==3.6
29
- importlib-resources==6.1.1
30
- ipython==8.18.1
31
- ipywidgets==8.1.2
32
- jedi==0.19.1
33
- jinja2==3.1.3
34
- jupyterlab-widgets==3.0.10
35
- kiwisolver==1.4.5
36
- markupsafe==2.1.5
37
- matplotlib-inline==0.1.6
38
- matplotlib==3.8.3
39
- mpmath==1.3.0
40
- multidict==6.0.5
41
- multiprocess==0.70.16
42
- networkx==3.2.1
43
- numpy==1.26.4
44
- nvidia-cublas-cu12==12.1.3.1
45
- nvidia-cuda-cupti-cu12==12.1.105
46
- nvidia-cuda-nvrtc-cu12==12.1.105
47
- nvidia-cuda-runtime-cu12==12.1.105
48
- nvidia-cudnn-cu12==8.9.2.26
49
- nvidia-cufft-cu12==11.0.2.54
50
- nvidia-curand-cu12==10.3.2.106
51
- nvidia-cusolver-cu12==11.4.5.107
52
- nvidia-cusparse-cu12==12.1.0.106
53
- nvidia-nccl-cu12==2.19.3
54
- nvidia-nvjitlink-cu12==12.3.101
55
- nvidia-nvtx-cu12==12.1.105
56
- packaging==23.2
57
- pandas==2.2.0
58
- parso==0.8.3
59
- peft==0.8.2
60
- pexpect==4.9.0
61
- pillow==10.2.0
62
- pip==23.3.1
63
- prompt-toolkit==3.0.43
64
- protobuf==4.25.3
65
- psutil==5.9.8
66
- ptyprocess==0.7.0
67
- pure-eval==0.2.2
68
- pyarrow-hotfix==0.6
69
- pyarrow==15.0.0
70
- pygments==2.17.2
71
- pyparsing==3.1.1
72
- python-dateutil==2.8.2
73
- pytz==2024.1
74
- pyyaml==6.0.1
75
- regex==2023.12.25
76
- requests==2.31.0
77
- safetensors==0.4.2
78
- scipy==1.12.0
79
- sentry-sdk==1.40.5
80
- setproctitle==1.3.3
81
- setuptools==68.2.2
82
- six==1.16.0
83
- smmap==5.0.1
84
- stack-data==0.6.3
85
- sympy==1.12
86
- tokenizers==0.15.2
87
- torch==2.2.1
88
- tqdm==4.66.2
89
- traitlets==5.14.1
90
- transformers==4.39.0.dev0
91
- triton==2.2.0
92
- typing-extensions==4.9.0
93
- tzdata==2024.1
94
- urllib3==2.2.1
95
- wandb==0.16.3
96
- wcwidth==0.2.13
97
- wheel==0.41.2
98
- widgetsnbextension==4.0.10
99
- xxhash==3.4.1
100
- yarl==1.9.4
101
- zipp==3.17.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20240223_032422-b657btrg/files/wandb-metadata.json DELETED
@@ -1,202 +0,0 @@
1
- {
2
- "os": "Linux-5.4.0-172-generic-x86_64-with-glibc2.31",
3
- "python": "3.9.18",
4
- "heartbeatAt": "2024-02-22T21:54:23.837753",
5
- "startedAt": "2024-02-22T21:54:22.976651",
6
- "docker": null,
7
- "cuda": null,
8
- "args": [],
9
- "state": "running",
10
- "program": "/home/rnd/Documents/Ameer/gemma/new.py",
11
- "codePathLocal": "new.py",
12
- "codePath": "new.py",
13
- "host": "rnd-System-Product-Name",
14
- "username": "rnd",
15
- "executable": "/home/rnd/miniconda3/envs/gemma/bin/python",
16
- "cpu_count": 24,
17
- "cpu_count_logical": 32,
18
- "cpu_freq": {
19
- "current": 4938.592250000001,
20
- "min": 800.0,
21
- "max": 5700.0
22
- },
23
- "cpu_freq_per_core": [
24
- {
25
- "current": 5500.0,
26
- "min": 800.0,
27
- "max": 7000.0
28
- },
29
- {
30
- "current": 5500.0,
31
- "min": 800.0,
32
- "max": 7000.0
33
- },
34
- {
35
- "current": 5500.0,
36
- "min": 800.0,
37
- "max": 7000.0
38
- },
39
- {
40
- "current": 5504.691,
41
- "min": 800.0,
42
- "max": 7000.0
43
- },
44
- {
45
- "current": 5500.0,
46
- "min": 800.0,
47
- "max": 7000.0
48
- },
49
- {
50
- "current": 5526.894,
51
- "min": 800.0,
52
- "max": 7000.0
53
- },
54
- {
55
- "current": 5500.0,
56
- "min": 800.0,
57
- "max": 7000.0
58
- },
59
- {
60
- "current": 5516.923,
61
- "min": 800.0,
62
- "max": 7000.0
63
- },
64
- {
65
- "current": 5800.0,
66
- "min": 800.0,
67
- "max": 7400.0
68
- },
69
- {
70
- "current": 5819.727,
71
- "min": 800.0,
72
- "max": 7400.0
73
- },
74
- {
75
- "current": 5684.191,
76
- "min": 800.0,
77
- "max": 7400.0
78
- },
79
- {
80
- "current": 5815.223,
81
- "min": 800.0,
82
- "max": 7400.0
83
- },
84
- {
85
- "current": 5500.0,
86
- "min": 800.0,
87
- "max": 7000.0
88
- },
89
- {
90
- "current": 5500.0,
91
- "min": 800.0,
92
- "max": 7000.0
93
- },
94
- {
95
- "current": 5500.0,
96
- "min": 800.0,
97
- "max": 7000.0
98
- },
99
- {
100
- "current": 5536.423,
101
- "min": 800.0,
102
- "max": 7000.0
103
- },
104
- {
105
- "current": 4296.422,
106
- "min": 800.0,
107
- "max": 4300.0
108
- },
109
- {
110
- "current": 4302.26,
111
- "min": 800.0,
112
- "max": 4300.0
113
- },
114
- {
115
- "current": 4292.475,
116
- "min": 800.0,
117
- "max": 4300.0
118
- },
119
- {
120
- "current": 4279.054,
121
- "min": 800.0,
122
- "max": 4300.0
123
- },
124
- {
125
- "current": 4283.433,
126
- "min": 800.0,
127
- "max": 4300.0
128
- },
129
- {
130
- "current": 4300.044,
131
- "min": 800.0,
132
- "max": 4300.0
133
- },
134
- {
135
- "current": 4368.421,
136
- "min": 800.0,
137
- "max": 4300.0
138
- },
139
- {
140
- "current": 4280.523,
141
- "min": 800.0,
142
- "max": 4300.0
143
- },
144
- {
145
- "current": 4291.353,
146
- "min": 800.0,
147
- "max": 4300.0
148
- },
149
- {
150
- "current": 4314.482,
151
- "min": 800.0,
152
- "max": 4300.0
153
- },
154
- {
155
- "current": 4299.578,
156
- "min": 800.0,
157
- "max": 4300.0
158
- },
159
- {
160
- "current": 4300.072,
161
- "min": 800.0,
162
- "max": 4300.0
163
- },
164
- {
165
- "current": 4301.981,
166
- "min": 800.0,
167
- "max": 4300.0
168
- },
169
- {
170
- "current": 4311.285,
171
- "min": 800.0,
172
- "max": 4300.0
173
- },
174
- {
175
- "current": 4302.597,
176
- "min": 800.0,
177
- "max": 4300.0
178
- },
179
- {
180
- "current": 4306.9,
181
- "min": 800.0,
182
- "max": 4300.0
183
- }
184
- ],
185
- "disk": {
186
- "/": {
187
- "total": 1832.2072448730469,
188
- "used": 1698.8227272033691
189
- }
190
- },
191
- "gpu": "NVIDIA GeForce RTX 3090 Ti",
192
- "gpu_count": 1,
193
- "gpu_devices": [
194
- {
195
- "name": "NVIDIA GeForce RTX 3090 Ti",
196
- "memory_total": 25757220864
197
- }
198
- ],
199
- "memory": {
200
- "total": 62.508731842041016
201
- }
202
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20240223_032422-b657btrg/files/wandb-summary.json DELETED
@@ -1 +0,0 @@
1
- {"train/loss": 2.9901, "train/grad_norm": 5.66660213470459, "train/learning_rate": 1.628256513026052e-05, "train/epoch": 0.0, "train/global_step": 175, "_timestamp": 1708640958.932987, "_runtime": 2095.9480090141296, "_step": 12, "eval/loss": 3.162160873413086, "eval/runtime": 340.5303, "eval/samples_per_second": 68.029, "eval/steps_per_second": 8.504, "_wandb": {"runtime": 2302}}
 
 
wandb/run-20240223_032422-b657btrg/logs/debug-internal.log DELETED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240223_032422-b657btrg/logs/debug.log DELETED
@@ -1,27 +0,0 @@
1
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Configure stats pid to 116027
3
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/.config/wandb/settings
4
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/Documents/Ameer/gemma/wandb/settings
5
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'new.py', 'program_abspath': '/home/rnd/Documents/Ameer/gemma/new.py', 'program': '/home/rnd/Documents/Ameer/gemma/new.py'}
8
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():526] Logging user logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug.log
9
- 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():527] Logging internal logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug-internal.log
10
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():566] calling init triggers
11
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
- config: {}
13
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():616] starting backend
14
- 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():620] setting up manager
15
- 2024-02-23 03:24:22,982 INFO MainThread:116027 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
- 2024-02-23 03:24:22,984 INFO MainThread:116027 [wandb_init.py:init():628] backend started and connected
17
- 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():720] updated telemetry
18
- 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
- 2024-02-23 03:24:23,690 INFO MainThread:116027 [wandb_run.py:_on_init():2262] communicating current version
20
- 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_run.py:_on_init():2271] got version response
21
- 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_init.py:init():804] starting run threads in backend
22
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_console_start():2241] atexit reg
23
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2096] redirect: wrap_raw
24
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2161] Wrapping output streams.
25
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2186] Redirects installed.
26
- 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_init.py:init():847] run started, returning control to user process
27
- 2024-02-23 03:24:24,872 INFO MainThread:116027 [wandb_run.py:_config_callback():1343] config_cb None None {'vocab_size': 256000, 'max_position_embeddings': 8192, 'hidden_size': 2048, 'intermediate_size': 16384, 'num_hidden_layers': 18, 'num_attention_heads': 8, 'head_dim': 256, 'num_key_value_heads': 1, 'hidden_act': 'gelu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 10000.0, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GemmaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 2, 'pad_token_id': 0, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'google/gemma-2b', 'transformers_version': '4.39.0.dev0', 'model_type': 'gemma', 'rope_scaling': None, 'quantization_config': {'quant_method': 'QuantizationMethod.BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': './gemma-jokes-gemma', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2.5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 500, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 25, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'gemma-jokes-gemma-2024-02-23-03-24', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20240223_032422-b657btrg/run-b657btrg.wandb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d50af94be5df0d2db6e804280eafdecb87c0ad05f6e33f9783c5396a91cc8b25
3
- size 8654393