recommend padding when using sample packing (#531)
Browse files
- examples/code-llama/13b/lora.yml +1 -0
- examples/code-llama/13b/qlora.yml +1 -0
- examples/code-llama/34b/lora.yml +1 -0
- examples/code-llama/34b/qlora.yml +1 -0
- examples/code-llama/7b/lora.yml +1 -0
- examples/code-llama/7b/qlora.yml +1 -0
- examples/llama-2/lora.yml +1 -0
- examples/llama-2/qlora.yml +1 -0
- examples/llama-2/relora.yml +1 -0
- src/axolotl/utils/config.py +5 -0
- tests/test_validation.py +14 -0
examples/code-llama/13b/lora.yml
CHANGED
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
17 |
|
18 |
sequence_len: 100000
|
19 |
sample_packing: true
|
|
|
20 |
|
21 |
adapter: lora
|
22 |
lora_model_dir:
|
|
|
17 |
|
18 |
sequence_len: 100000
|
19 |
sample_packing: true
|
20 |
+
pad_to_sequence_len: true
|
21 |
|
22 |
adapter: lora
|
23 |
lora_model_dir:
|
examples/code-llama/13b/qlora.yml
CHANGED
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
20 |
|
21 |
sequence_len: 100000
|
22 |
sample_packing: true
|
|
|
23 |
|
24 |
lora_r: 32
|
25 |
lora_alpha: 16
|
|
|
20 |
|
21 |
sequence_len: 100000
|
22 |
sample_packing: true
|
23 |
+
pad_to_sequence_len: true
|
24 |
|
25 |
lora_r: 32
|
26 |
lora_alpha: 16
|
examples/code-llama/34b/lora.yml
CHANGED
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
17 |
|
18 |
sequence_len: 100000
|
19 |
sample_packing: true
|
|
|
20 |
|
21 |
adapter: lora
|
22 |
lora_model_dir:
|
|
|
17 |
|
18 |
sequence_len: 100000
|
19 |
sample_packing: true
|
20 |
+
pad_to_sequence_len: true
|
21 |
|
22 |
adapter: lora
|
23 |
lora_model_dir:
|
examples/code-llama/34b/qlora.yml
CHANGED
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
20 |
|
21 |
sequence_len: 100000
|
22 |
sample_packing: true
|
|
|
23 |
|
24 |
lora_r: 32
|
25 |
lora_alpha: 16
|
|
|
20 |
|
21 |
sequence_len: 100000
|
22 |
sample_packing: true
|
23 |
+
pad_to_sequence_len: true
|
24 |
|
25 |
lora_r: 32
|
26 |
lora_alpha: 16
|
examples/code-llama/7b/lora.yml
CHANGED
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
17 |
|
18 |
sequence_len: 100000
|
19 |
sample_packing: true
|
|
|
20 |
|
21 |
adapter: lora
|
22 |
lora_model_dir:
|
|
|
17 |
|
18 |
sequence_len: 100000
|
19 |
sample_packing: true
|
20 |
+
pad_to_sequence_len: true
|
21 |
|
22 |
adapter: lora
|
23 |
lora_model_dir:
|
examples/code-llama/7b/qlora.yml
CHANGED
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
20 |
|
21 |
sequence_len: 100000
|
22 |
sample_packing: true
|
|
|
23 |
|
24 |
lora_r: 32
|
25 |
lora_alpha: 16
|
|
|
20 |
|
21 |
sequence_len: 100000
|
22 |
sample_packing: true
|
23 |
+
pad_to_sequence_len: true
|
24 |
|
25 |
lora_r: 32
|
26 |
lora_alpha: 16
|
examples/llama-2/lora.yml
CHANGED
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
17 |
|
18 |
sequence_len: 4096
|
19 |
sample_packing: true
|
|
|
20 |
|
21 |
adapter: lora
|
22 |
lora_model_dir:
|
|
|
17 |
|
18 |
sequence_len: 4096
|
19 |
sample_packing: true
|
20 |
+
pad_to_sequence_len: true
|
21 |
|
22 |
adapter: lora
|
23 |
lora_model_dir:
|
examples/llama-2/qlora.yml
CHANGED
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
20 |
|
21 |
sequence_len: 4096
|
22 |
sample_packing: true
|
|
|
23 |
|
24 |
lora_r: 32
|
25 |
lora_alpha: 16
|
|
|
20 |
|
21 |
sequence_len: 4096
|
22 |
sample_packing: true
|
23 |
+
pad_to_sequence_len: true
|
24 |
|
25 |
lora_r: 32
|
26 |
lora_alpha: 16
|
examples/llama-2/relora.yml
CHANGED
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
20 |
|
21 |
sequence_len: 4096
|
22 |
sample_packing: true
|
|
|
23 |
|
24 |
lora_r: 8
|
25 |
lora_alpha: 16
|
|
|
20 |
|
21 |
sequence_len: 4096
|
22 |
sample_packing: true
|
23 |
+
pad_to_sequence_len: true
|
24 |
|
25 |
lora_r: 8
|
26 |
lora_alpha: 16
|
src/axolotl/utils/config.py
CHANGED
@@ -97,6 +97,11 @@ def validate_config(cfg):
|
|
97 |
)
|
98 |
)
|
99 |
|
|
|
|
|
|
|
|
|
|
|
100 |
if cfg.gradient_accumulation_steps and cfg.batch_size:
|
101 |
raise ValueError(
|
102 |
"please set only one of gradient_accumulation_steps or batch_size"
|
|
|
97 |
)
|
98 |
)
|
99 |
|
100 |
+
if cfg.sample_packing and not cfg.pad_to_sequence_len:
|
101 |
+
LOG.warning(
|
102 |
+
"`pad_to_sequence_len: true` is recommended when using sample_packing"
|
103 |
+
)
|
104 |
+
|
105 |
if cfg.gradient_accumulation_steps and cfg.batch_size:
|
106 |
raise ValueError(
|
107 |
"please set only one of gradient_accumulation_steps or batch_size"
|
tests/test_validation.py
CHANGED
@@ -328,6 +328,20 @@ class ValidationTest(unittest.TestCase):
|
|
328 |
for record in self._caplog.records
|
329 |
)
|
330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
cfg = DictDefault(
|
332 |
{
|
333 |
"max_packed_sequence_len": 2048,
|
|
|
328 |
for record in self._caplog.records
|
329 |
)
|
330 |
|
331 |
+
cfg = DictDefault(
|
332 |
+
{
|
333 |
+
"sample_packing": True,
|
334 |
+
"pad_to_sequence_len": None,
|
335 |
+
}
|
336 |
+
)
|
337 |
+
with self._caplog.at_level(logging.WARNING):
|
338 |
+
validate_config(cfg)
|
339 |
+
assert any(
|
340 |
+
"`pad_to_sequence_len: true` is recommended when using sample_packing"
|
341 |
+
in record.message
|
342 |
+
for record in self._caplog.records
|
343 |
+
)
|
344 |
+
|
345 |
cfg = DictDefault(
|
346 |
{
|
347 |
"max_packed_sequence_len": 2048,
|