'logs/teacher', (0, 448) 'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, per_device_train_batch_size=8', (512, 448) 'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=None, dataset_uri=distily_filtered_redpajama_en, per_device_train_batch_size=8', (1024, 448) 'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb-edu, per_device_train_batch_size=8', (1536, 448) 'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb, per_device_train_batch_size=8', (2048, 448) 'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb, learning_rate=6e-05, per_device_train_batch_size=8', (2560, 448) 'distily_smollm_dataset_sweep/logs/dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb-edu, learning_rate=6e-05, per_device_train_batch_size=8', (3072, 448)