casperhansen and winglian committed
Commit 6840381
1 Parent(s): cda52dc

Add desc to map/filter (#1162)


* Add desc to map/filter

* update descriptions

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
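
The change is mechanical but useful: datasets' Dataset.map() and Dataset.filter() accept a desc keyword that labels the tqdm progress bar printed while the job runs, so each preprocessing stage in axolotl's logs becomes identifiable instead of an anonymous "Map"/"Filter". A minimal sketch of the behavior outside the repo (the toy data and add_len helper below are illustrative, not axolotl code):

    from datasets import Dataset

    ds = Dataset.from_dict({"text": ["hello", "world", "axolotl"]})

    def add_len(example):
        # toy transform: attach a character-count column
        example["length"] = len(example["text"])
        return example

    # the progress bar prints with the label, roughly: Adding length column: 100%|... 3/3
    ds = ds.map(add_len, desc="Adding length column")

    # filter takes the same keyword
    ds = ds.filter(lambda ex: ex["length"] > 5, desc="Dropping short rows")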

src/axolotl/cli/__init__.py CHANGED
@@ -410,7 +410,10 @@ def load_rl_datasets(
     for i, data_set in enumerate(train_datasets):
         _type = cfg.datasets[i]["type"]
         ds_type_fn = locals()[_type]
-        train_datasets[i] = data_set.map(ds_type_fn)
+        train_datasets[i] = data_set.map(
+            ds_type_fn,
+            desc="Mapping RL Dataset",
+        )
     train_dataset = concatenate_datasets(train_datasets)

     # eval_dataset = eval_dataset.map(intel_apply_chatml)
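
For the RL path, the only behavioral addition above is the label; the surrounding pattern of mapping each configured dataset with its type-specific transform and then concatenating is unchanged. A hedged sketch of that shape (to_rl_format is a placeholder transform, not one of the repo's type functions):

    from datasets import Dataset, concatenate_datasets

    def to_rl_format(example):
        # placeholder for a type-specific transform such as a DPO formatter
        return example

    train_datasets = [
        Dataset.from_dict({"prompt": ["a"], "chosen": ["x"], "rejected": ["y"]}),
        Dataset.from_dict({"prompt": ["b"], "chosen": ["u"], "rejected": ["v"]}),
    ]
    for i, data_set in enumerate(train_datasets):
        train_datasets[i] = data_set.map(
            to_rl_format,
            desc="Mapping RL Dataset",  # same label as the hunk above
        )
    train_dataset = concatenate_datasets(train_datasets)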
src/axolotl/datasets.py CHANGED
@@ -57,6 +57,7 @@ class TokenizedPromptDataset(Dataset):
             num_proc=num_proc,
             remove_columns=features,
             keep_in_memory=self.keep_in_memory,
+            desc="Tokenizing Prompts",
             **map_kwargs,
         )

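In TokenizedPromptDataset the new desc sits alongside the map options that were already there, and the same description labels the sharded progress reporting when num_proc is greater than one. An illustrative sketch with toy data (tokenize_fn below is a stand-in, not the repo's prompt tokenizer):

    from datasets import Dataset

    raw = Dataset.from_dict({"instruction": ["say hi", "say bye", "wave", "nod"]})

    def tokenize_fn(example):
        # stand-in tokenizer: fake token ids from character codes
        return {"input_ids": [ord(c) for c in example["instruction"]]}

    tokenized = raw.map(
        tokenize_fn,
        num_proc=2,                           # parallel workers
        remove_columns=raw.features.keys(),   # drop the raw text columns
        keep_in_memory=True,                  # skip writing a cache file
        desc="Tokenizing Prompts",            # label added by this commit
    )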
 
src/axolotl/utils/data.py CHANGED
@@ -792,6 +792,7 @@ def load_pretraining_dataset(path, tokenizer, cfg, name=None, max_tokens=2048, s
         # remove all the existing columns after mapping since they end up having
         # a different length than the encoded/tokenized column
         remove_columns=dataset.features.keys(),
+        desc="Encoding Pretraining",
     )
     return dataset

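The comment retained in this hunk is the reason remove_columns=dataset.features.keys() is needed: a batched encode can return a different number of rows than it consumes, so every original column would end up with a mismatched length. A hedged toy example of that pattern (the chunking encode below is illustrative, unlike axolotl's tokenizer-based encode):

    from datasets import Dataset

    dataset = Dataset.from_dict({"text": ["abcdefgh", "ijkl"]})

    def encode(batch):
        # split each text into 4-character chunks; output rows != input rows
        chunks = [t[i : i + 4] for t in batch["text"] for i in range(0, len(t), 4)]
        return {"input_ids": [[ord(c) for c in chunk] for chunk in chunks]}

    dataset = dataset.map(
        encode,
        batched=True,
        # original columns now have the wrong length, so drop them all
        remove_columns=dataset.features.keys(),
        desc="Encoding Pretraining",
    )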
 
src/axolotl/utils/trainer.py CHANGED
@@ -134,12 +134,14 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
         drop_long,
         num_proc=cfg.dataset_processes,
         load_from_cache_file=not cfg.is_preprocess,
+        desc="Dropping Long Sequences",
     )
     if eval_dataset:
         eval_dataset = eval_dataset.filter(
             drop_long,
             num_proc=cfg.dataset_processes,
             load_from_cache_file=not cfg.is_preprocess,
+            desc="Dropping Long Sequences",
         )

     if cfg.group_by_length:
@@ -147,6 +149,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
             add_length,
             num_proc=cfg.dataset_processes,
             load_from_cache_file=not cfg.is_preprocess,
+            desc="Group By Length",
         )

     if cfg.sample_packing:
@@ -154,6 +157,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
             add_position_ids,
             num_proc=cfg.dataset_processes,
             load_from_cache_file=not cfg.is_preprocess,
+            desc="Add position_id column (Sample Packing)",
         )
         if cfg.eval_sample_packing is not False:
             if eval_dataset:
@@ -161,6 +165,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                     add_position_ids,
                     num_proc=cfg.dataset_processes,
                     load_from_cache_file=not cfg.is_preprocess,
+                    desc="Add position_id column (Sample Packing)",
                 )

     return train_dataset, eval_dataset
@@ -169,9 +174,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
 def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
     drop_long = partial(drop_long_seq, sequence_len=sequence_len)

-    train_dataset = train_dataset.filter(drop_long)
+    train_dataset = train_dataset.filter(
+        drop_long,
+        desc="Dropping Long Sequences",
+    )
     train_dataset = train_dataset.map(
         add_position_ids,
+        desc="Add position_id column (Pretraining Sample Packing)",
     )
     return train_dataset

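
Because process_datasets_for_packing chains several filter/map stages, the distinct desc strings are what make the progress output readable; when a stage is instead served from the datasets cache (load_from_cache_file left True), the library reports a cache load rather than running a bar, so the labels mostly show up on fresh runs or when cfg.is_preprocess forces reprocessing. A small stand-alone sketch of two labeled stages mirroring the pretraining path (drop_long_seq and add_position_ids here are simplified stand-ins for the repo's helpers):

    from functools import partial
    from datasets import Dataset

    def drop_long_seq(sample, sequence_len=2048):
        # keep rows whose token count fits the context window
        return 0 < len(sample["input_ids"]) <= sequence_len

    def add_position_ids(sample):
        sample["position_ids"] = list(range(len(sample["input_ids"])))
        return sample

    train_dataset = Dataset.from_dict({"input_ids": [[1, 2, 3], list(range(5000))]})

    drop_long = partial(drop_long_seq, sequence_len=2048)
    train_dataset = train_dataset.filter(
        drop_long,
        desc="Dropping Long Sequences",
    )
    train_dataset = train_dataset.map(
        add_position_ids,
        desc="Add position_id column (Pretraining Sample Packing)",
    )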