File size: 2,038 Bytes
e50ab07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bce45f
e50ab07
 
 
 
98b4762
e50ab07
 
 
 
 
 
 
 
7570446
e50ab07
 
 
2202a20
e50ab07
 
d66b101
98b4762
 
 
 
 
badda37
 
98b4762
e50ab07
 
 
 
 
 
 
 
 
 
5bce45f
 
 
 
 
e50ab07
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
CLI to run training on a model
"""
import logging
from pathlib import Path

import fire
import transformers
from colorama import Fore

from axolotl.cli import (
    check_accelerate_default_config,
    check_user_token,
    load_cfg,
    load_datasets,
    load_rl_datasets,
    print_axolotl_text_art,
)
from axolotl.common.cli import PreprocessCliArgs
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.prompt_strategies.sharegpt import register_chatml_template

LOG = logging.getLogger("axolotl.cli.preprocess")


def do_cli(config: Path = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    parsed_cfg = load_cfg(config, **kwargs)
    parsed_cfg.is_preprocess = True
    check_accelerate_default_config()
    check_user_token()
    parser = transformers.HfArgumentParser((PreprocessCliArgs))
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )

    if parsed_cfg.chat_template == "chatml" and parsed_cfg.default_system_message:
        LOG.info(
            f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
        )
        register_chatml_template(parsed_cfg.default_system_message)
    else:
        register_chatml_template()

    if not parsed_cfg.dataset_prepared_path:
        msg = (
            Fore.RED
            + "preprocess CLI called without dataset_prepared_path set, "
            + f"using default path: {DEFAULT_DATASET_PREPARED_PATH}"
            + Fore.RESET
        )
        LOG.warning(msg)
        parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

    if parsed_cfg.rl:
        load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
    else:
        load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)

    LOG.info(
        Fore.GREEN
        + f"Success! Preprocessed data path: `dataset_prepared_path: {parsed_cfg.dataset_prepared_path}`"
        + Fore.RESET
    )


if __name__ == "__main__":
    fire.Fire(do_cli)