|
|
|
import hashlib |
|
import itertools |
|
import logging |
|
import math |
|
from typing import Any, Callable, List, Union |
|
|
|
import numba |
|
import numpy as np |
|
from torch.utils.data import DistributedSampler, Sampler |
|
|
|
LOG = logging.getLogger("axolotl.utils.dataloader") |
|
|
|
|
|
@numba.njit |
|
def ffd_check(a: np.ndarray, c: int, n: int): |
|
|
|
|
|
|
|
|
|
a = np.sort(a)[::-1] |
|
bins = np.full((n,), c, dtype=a.dtype) |
|
for size in a: |
|
not_found = True |
|
for idx in range(n): |
|
if bins[idx] >= size: |
|
bins[idx] -= size |
|
not_found = False |
|
break |
|
|
|
if not_found: |
|
return False |
|
|
|
return True |
|
|
|
|
|
@numba.njit |
|
def ffd_with_result(a: np.ndarray, c: int, start_index: int): |
|
|
|
|
|
indices = np.argsort(a)[::-1] |
|
a = a[indices] |
|
|
|
bins: List[Any] = [] |
|
bins_result: List[Any] = [] |
|
for a_id, size in enumerate(a): |
|
add_new = True |
|
for idx in range(len(bins)): |
|
if bins[idx] >= size: |
|
bins[idx] -= size |
|
bins_result[idx].append(indices[a_id] + start_index) |
|
add_new = False |
|
break |
|
|
|
if add_new: |
|
bins.append(c - size) |
|
bins_result.append([indices[a_id] + start_index]) |
|
|
|
return bins_result, len(a) |
|
|
|
|
|
@numba.njit |
|
def allocate( |
|
lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int |
|
): |
|
""" |
|
:param lengths: array of lengths of each sample |
|
:param lengths_cumsum: cumulative sum of consecutive lengths |
|
:param rank: rank for this process |
|
:param c: length of tokens per batch |
|
:param n: number of ranks |
|
:return: |
|
""" |
|
|
|
|
|
|
|
|
|
s = 0 |
|
start_index = 0 |
|
result = [] |
|
result_totseqs = [] |
|
|
|
while True: |
|
|
|
left = 1 |
|
right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right") |
|
|
|
while right - left > 1: |
|
mid = (left + right) // 2 |
|
if ffd_check(lengths[start_index : start_index + mid], c, n): |
|
left = mid |
|
else: |
|
right = mid |
|
|
|
|
|
batch, tot_seqs = ffd_with_result( |
|
lengths[start_index : start_index + left], c, start_index |
|
) |
|
if len(batch) < n: |
|
break |
|
|
|
start_index += left |
|
s = lengths_cumsum[start_index - 1] |
|
|
|
|
|
result.append(batch[rank]) |
|
|
|
result_totseqs.append(tot_seqs) |
|
|
|
return result, result_totseqs, s, len(result) * c * n |
|
|
|
|
|
def chunk(iterable, n): |
|
""" |
|
Chunk data into tuples of length n |
|
""" |
|
|
|
if n < 1: |
|
raise ValueError("n must be at least one") |
|
it = iter(iterable) |
|
while batch := tuple(itertools.islice(it, n)): |
|
yield batch |
|
|
|
|
|
def hash_indices(lst: List[int]) -> str: |
|
|
|
concatenated = ",".join(map(str, lst)) |
|
|
|
|
|
sha256 = hashlib.sha256() |
|
sha256.update(concatenated.encode()) |
|
|
|
return sha256.hexdigest() |
|
|
|
|
|
class MultipackDistributedDataloader: |
|
"""Unpadded data loading using Multipack. |
|
Adapted from https://github.com/imoneoi/openchat/blob/v3_fix_mle_loss/ochat/training_deepspeed/multipack_dataloader.py |
|
Approximate (at most ~1.22x) the optimal solution of the identical-machines scheduling problem, which is NP-hard. |
|
""" |
|
|
|
def __init__( |
|
self, |
|
dataset: Any, |
|
collate_fn: Callable, |
|
seq_max_length: int = 2048, |
|
batch_size: int = 1, |
|
sampler: Union[Sampler, DistributedSampler] = None, |
|
packing_efficiency_estimate: float = 1.0, |
|
sample_packing_seq_len_multiplier: int = 1, |
|
device_count: int = 1, |
|
): |
|
|
|
self.dataset = dataset |
|
self.lengths = ( |
|
dataset.data.column("position_ids") |
|
.to_pandas() |
|
.apply(lambda x: x[-1] + 1) |
|
.values |
|
) |
|
assert isinstance(self.lengths, np.ndarray) |
|
assert batch_size % sample_packing_seq_len_multiplier == 0 |
|
assert batch_size >= sample_packing_seq_len_multiplier |
|
self.sampler = sampler |
|
self.batch_size = batch_size |
|
self.sample_packing_seq_len_multiplier = sample_packing_seq_len_multiplier |
|
self.seq_max_length = seq_max_length |
|
self.batch_max_length = batch_size * seq_max_length |
|
self.collate_fn = collate_fn |
|
|
|
self.num_replicas = 1 |
|
self.rank = 0 |
|
|
|
|
|
self.eff_total_used = 0 |
|
self.eff_total_slots = 0 |
|
self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0 |
|
self.device_count = device_count |
|
|
|
def generate_batches(self, set_stats=False): |
|
LOG.info("generating packed batches") |
|
if self.sampler: |
|
indices = [idx for idx in self.sampler] |
|
else: |
|
indices = range(0, len(self.dataset)) |
|
|
|
LOG.info(hash_indices(indices)) |
|
lengths = self.lengths[indices] |
|
lengths_cumsum = np.cumsum(lengths) |
|
|
|
batches, totseqs, total_used, total_slots = allocate( |
|
lengths=lengths, |
|
lengths_cumsum=lengths_cumsum, |
|
rank=self.rank, |
|
|
|
c=self.seq_max_length * self.sample_packing_seq_len_multiplier, |
|
n=self.num_replicas, |
|
) |
|
|
|
batches = [[indices[b_idx] for b_idx in batch] for batch in batches] |
|
|
|
|
|
if set_stats: |
|
self.eff_total_used += total_used |
|
self.eff_total_slots += total_slots |
|
|
|
return batches, totseqs |
|
|
|
def __iter__(self): |
|
if hasattr(self.sampler, "set_epoch"): |
|
new_epoch = self.sampler.epoch + 1 |
|
self.sampler.set_epoch(new_epoch) |
|
LOG.info(f"calling sampler.set_epoch({new_epoch})") |
|
all_batches, _ = self.generate_batches(set_stats=True) |
|
features = self.dataset.features.keys() |
|
len_remaining = self._len_est() |
|
for batches in chunk( |
|
all_batches, self.batch_size // self.sample_packing_seq_len_multiplier |
|
): |
|
chunked_data = [] |
|
attn_mask_cum_idx = 0 |
|
for batch in batches: |
|
concatenated = {} |
|
batched_data = [self.dataset[batch_idx] for batch_idx in batch] |
|
for feature in features: |
|
if feature == "attention_mask": |
|
arrays = [ |
|
(attn_mask_cum_idx + idx + 1) * np.array(item[feature]) |
|
for idx, item in enumerate(batched_data) |
|
if feature in item |
|
] |
|
attn_mask_cum_idx += len(batched_data) |
|
concatenated[feature] = np.concatenate(arrays) |
|
else: |
|
arrays = [ |
|
np.array(item[feature]) |
|
for item in batched_data |
|
if feature in item |
|
] |
|
concatenated[feature] = np.concatenate(arrays) |
|
chunked_data.append(concatenated) |
|
yield self.collate_fn(chunked_data) |
|
len_remaining -= 1 |
|
if not len_remaining: |
|
return |
|
|
|
for i in range(0, len_remaining): |
|
yield self.collate_fn( |
|
[ |
|
{ |
|
"input_ids": [0], |
|
"labels": [-100], |
|
"attention_mask": [True], |
|
"position_ids": [0], |
|
} |
|
] |
|
) |
|
|
|
def _len_est(self): |
|
lengths_sum = np.sum(self.lengths) |
|
lengths_sum_per_device = lengths_sum // self.device_count |
|
LOG.info( |
|
f"packing_efficiency_estimate: {self.packing_efficiency_estimate} " |
|
f"total_num_tokens per device: {lengths_sum_per_device}" |
|
) |
|
|
|
|
|
return ( |
|
math.floor( |
|
0.99 |
|
* lengths_sum_per_device |
|
/ self.packing_efficiency_estimate |
|
// self.seq_max_length |
|
// self.batch_size |
|
) |
|
- 1 |
|
) |
|
|
|
def __len__(self): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return max(1, self._len_est()) |
|
|
|
def len_w_stats(self): |
|
if not self.eff_total_used: |
|
batches, _ = self.generate_batches(set_stats=True) |
|
LOG.info( |
|
f"packing_efficiency_estimate: {self.packing_efficiency_estimate} " |
|
f"actual packing efficiency: {self.efficiency()}" |
|
) |
|
return max(1, self._len_est()) |
|
|
|
def efficiency(self): |
|
return self.eff_total_used / self.eff_total_slots |
|
|