distily_smollm_dataset_sweep / benchmarks.shelve.bak
lapp0's picture
End of training
9269478 verified
'logs/teacher', (0, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, per_device_train_batch_size=8', (512, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=None, dataset_uri=distily_filtered_redpajama_en, per_device_train_batch_size=8', (1024, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb-edu, per_device_train_batch_size=8', (1536, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb, per_device_train_batch_size=8', (2048, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb, learning_rate=6e-05, per_device_train_batch_size=8', (2560, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb-edu, learning_rate=6e-05, per_device_train_batch_size=8', (3072, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=4000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, per_device_train_batch_size=8', (3584, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=4000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, learning_rate=6e-05, per_device_train_batch_size=8', (4096, 448)
'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=4000000, dataset_shuffle=True, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, per_device_train_batch_size=8', (4608, 448)