minor tweaks to simplify (#597)
src/axolotl/utils/tokenization.py

```diff
@@ -18,21 +18,16 @@ def check_example_labels(example, tokenizer, text_only=False):
     # Get the input_ids, labels, and attention_mask from the dataset
     input_ids = example["input_ids"]
     labels = example["labels"]
-    attention_mask = example["attention_mask"]
 
     # You can compare the input_ids and labels element-wise
     # Remember to ignore positions with IGNORE_TOKEN_ID (if you use it) or attention_mask equal to 0
     colored_tokens = []
-    for _, (input_id, label_id, mask) in enumerate(
-        zip(input_ids, labels, attention_mask)
-    ):
+    for _, (input_id, label_id) in enumerate(zip(input_ids, labels)):
         decoded_input_token = tokenizer.decode(input_id)
         # Choose the color based on whether the label has the ignore value or not
         color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
         colored_token = colored(decoded_input_token, color) + (
-            not text_only
-            and colored(f"({label_id}, {mask}, {input_id})", "white")
-            or ""
+            not text_only and colored(f"({label_id}, {input_id})", "white") or ""
         )
         colored_tokens.append(colored_token)
 
```
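For context, here is a minimal standalone sketch of what the simplified loop does. `colored` is termcolor's helper, as in axolotl, but the function name `preview_labels`, the `IGNORE_TOKEN_ID` constant, and the color-legend comments are illustrative additions, not part of this PR:

```python
# Minimal sketch, not the actual axolotl function: `preview_labels` and
# IGNORE_TOKEN_ID are hypothetical names; `colored` is termcolor's helper.
from termcolor import colored

IGNORE_TOKEN_ID = -100  # label value excluded from the loss


def preview_labels(example, tokenizer, text_only=False):
    colored_tokens = []
    for input_id, label_id in zip(example["input_ids"], example["labels"]):
        token = tokenizer.decode(input_id)
        # red = ignored by the loss, yellow = label 0, green = supervised
        color = "red" if label_id == IGNORE_TOKEN_ID else ("yellow" if label_id == 0 else "green")
        # conditional expression instead of the `and/or` idiom used in the diff
        suffix = "" if text_only else colored(f"({label_id}, {input_id})", "white")
        colored_tokens.append(colored(token, color) + suffix)
    return " ".join(colored_tokens)
```

Note that the diff keeps the old `cond and a or b` idiom. It happens to be safe here only because the fallback is the same empty string a falsy `colored(...)` result would produce; the conditional expression in the sketch reads more plainly without changing behavior.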
src/axolotl/utils/trainer.py

```diff
@@ -429,7 +429,7 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
         .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
         .values
     )
-    LOG.info(f"
+    LOG.info(f"total_num_tokens: {total_num_tokens}")
     cfg.total_num_tokens = total_num_tokens
 
     if not cfg.total_supervised_tokens:
@@ -489,6 +489,8 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
         data_loader_len = data_loader.len_w_stats()
         actual_eff = data_loader.efficiency()
         LOG.info(f"data_loader_len: {data_loader_len}")
+        # FIXME: is there a bug here somewhere? the total num steps depends
+        # on the agreed on value for sample_packing_eff_est
         total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs))
 
         def calc_sample_packing_eff_est(estimates: List[float]):
@@ -502,10 +504,8 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             sample_packing_eff_est = (
                 math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0
            )
-            LOG.info(
-                f"UPDATE CONFIG WITH: `sample_packing_eff_est: {sample_packing_eff_est}`"
-            )
             cfg.sample_packing_eff_est = sample_packing_eff_est
+            LOG.info(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}")
     else:
         total_num_steps = int(
             math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
```