optimize calculation of cu_seqlens from position_ids (#1084) [skip ci]
src/axolotl/monkeypatch/utils.py
CHANGED
@@ -55,6 +55,7 @@ def get_cu_seqlens(attn_mask):
     return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)


+@torch.jit.script
 def get_cu_seqlens_from_pos_ids(position_ids):
     """generate a cumulative sequence length mask for flash attention using pos ids"""
     if len(position_ids.shape) == 1:
@@ -81,7 +82,7 @@ def get_cu_seqlens_from_pos_ids(position_ids):
         # Get the indices where the sequence starts
         start_indices = torch.cat(
             [
-                (seq_starts).nonzero(as_tuple=True)[0],
+                torch.nonzero(seq_starts).unbind(dim=1)[0],
                 torch.tensor([len(adjusted_row)], dtype=torch.int32, device=device),
             ]
         )
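The two halves of this patch go together: once the function is compiled with @torch.jit.script, the index extraction has to change, the likely reason being that Tensor.nonzero(as_tuple=True) is not supported under TorchScript, while torch.nonzero(mask).unbind(dim=1)[0] yields the same 1-D index tensor using only scriptable ops. Below is a minimal sketch of the pattern; the helper name first_token_indices and the sample inputs are illustrative, not part of the patch:

import torch


@torch.jit.script
def first_token_indices(position_ids: torch.Tensor) -> torch.Tensor:
    """Illustrative helper: indices where a packed row starts a new sequence,
    assuming a sequence begins wherever the position id resets to 0."""
    seq_starts = torch.cat(
        [
            torch.tensor([True], dtype=torch.bool, device=position_ids.device),
            position_ids[1:] == 0,
        ]
    )
    # TorchScript-friendly equivalent of seq_starts.nonzero(as_tuple=True)[0]
    return torch.nonzero(seq_starts).unbind(dim=1)[0]


# Two packed sequences of lengths 3 and 4 in a single row:
pos = torch.tensor([0, 1, 2, 0, 1, 2, 3])
print(first_token_indices(pos))  # tensor([0, 3])

# Both index forms agree in eager mode:
mask = torch.tensor([False, True, False, True])
assert torch.equal(
    mask.nonzero(as_tuple=True)[0],
    torch.nonzero(mask).unbind(dim=1)[0],
)

The swap only matters under scripting; in eager mode the two forms are interchangeable, as the assertion above checks.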