Spaces:

Oopstom
/

ReactSeq

Sleeping

File size: 54,367 Bytes

c668e80

""" Implementation of all available options """
import configargparse

from onmt.modules.sru import CheckSRU
from onmt.transforms import AVAILABLE_TRANSFORMS
from onmt.constants import ModelTask
from onmt.modules.position_ffn import ACTIVATION_FUNCTIONS
from onmt.modules.position_ffn import ActivationFunction
from onmt.constants import DefaultTokens


def config_opts(parser):
    group = parser.add_argument_group("Configuration")
    group.add(
        "-config",
        "--config",
        required=False,
        is_config_file_arg=True,
        help="Path of the main YAML config file.",
    )
    group.add(
        "-save_config",
        "--save_config",
        required=False,
        is_write_out_config_file_arg=True,
        help="Path where to save the config.",
    )


def _add_logging_opts(parser, is_train=True):
    group = parser.add_argument_group("Logging")
    group.add(
        "--log_file",
        "-log_file",
        type=str,
        default="",
        help="Output logs to a file under this path.",
    )
    group.add(
        "--log_file_level",
        "-log_file_level",
        type=str,
        action=StoreLoggingLevelAction,
        choices=StoreLoggingLevelAction.CHOICES,
        default="0",
    )
    group.add(
        "--verbose",
        "-verbose",
        action="store_true",
        help="Print data loading and statistics for all process"
        "(default only log the first process shard)"
        if is_train
        else "Print scores and predictions for each sentence",
    )

    if is_train:
        group.add(
            "--valid_metrics",
            "-valid_metrics",
            default=[],
            nargs="+",
            help="List of names of additional validation metrics",
        )
        group.add(
            "--scoring_debug",
            "-scoring_debug",
            action="store_true",
            help="Dump the src/ref/pred of the current batch",
        )
        group.add(
            "--dump_preds",
            "-dump_preds",
            type=str,
            default=None,
            help="Folder to dump predictions to.",
        )
        group.add(
            "--report_every",
            "-report_every",
            type=int,
            default=50,
            help="Print stats at this interval.",
        )
        group.add(
            "--exp_host",
            "-exp_host",
            type=str,
            default="",
            help="Send logs to this crayon server.",
        )
        group.add(
            "--exp",
            "-exp",
            type=str,
            default="",
            help="Name of the experiment for logging.",
        )
        # Use Tensorboard for visualization during training
        group.add(
            "--tensorboard",
            "-tensorboard",
            action="store_true",
            help="Use tensorboard for visualization during training. "
            "Must have the library tensorboard >= 1.14.",
        )
        group.add(
            "--tensorboard_log_dir",
            "-tensorboard_log_dir",
            type=str,
            default="runs/onmt",
            help="Log directory for Tensorboard. " "This is also the name of the run.",
        )
        group.add(
            "--override_opts",
            "-override-opts",
            action="store_true",
            help="Allow to override some checkpoint opts",
        )
    else:
        # Options only during inference
        group.add(
            "--attn_debug",
            "-attn_debug",
            action="store_true",
            help="Print best attn for each word",
        )
        group.add(
            "--align_debug",
            "-align_debug",
            action="store_true",
            help="Print best align for each word",
        )
        group.add(
            "--dump_beam",
            "-dump_beam",
            type=str,
            default="",
            help="File to dump beam information to.",
        )
        group.add(
            "--n_best",
            "-n_best",
            type=int,
            default=1,
            help="If verbose is set, will output the n_best " "decoded sentences",
        )
        group.add(
            "--with_score",
            "-with_score",
            action="store_true",
            help="add a tab separated score to the translation",
        )


def _add_reproducibility_opts(parser):
    group = parser.add_argument_group("Reproducibility")
    group.add(
        "--seed",
        "-seed",
        type=int,
        default=-1,
        help="Set random seed used for better " "reproducibility between experiments.",
    )


def _add_dynamic_corpus_opts(parser, build_vocab_only=False):
    """Options related to training corpus, type: a list of dictionary."""
    group = parser.add_argument_group("Data")
    group.add(
        "-data",
        "--data",
        required=True,
        help="List of datasets and their specifications. "
        "See examples/*.yaml for further details.",
    )
    group.add(
        "-skip_empty_level",
        "--skip_empty_level",
        default="warning",
        choices=["silent", "warning", "error"],
        help="Security level when encounter empty examples."
        "silent: silently ignore/skip empty example;"
        "warning: warning when ignore/skip empty example;"
        "error: raise error & stop execution when encouter empty.",
    )
    group.add(
        "-transforms",
        "--transforms",
        default=[],
        nargs="+",
        choices=AVAILABLE_TRANSFORMS.keys(),
        help="Default transform pipeline to apply to data. "
        "Can be specified in each corpus of data to override.",
    )

    group.add(
        "-save_data",
        "--save_data",
        required=build_vocab_only,
        help="Output base path for objects that will "
        "be saved (vocab, transforms, embeddings, ...).",
    )
    group.add(
        "-overwrite",
        "--overwrite",
        action="store_true",
        help="Overwrite existing objects if any.",
    )
    group.add(
        "-n_sample",
        "--n_sample",
        type=int,
        default=(5000 if build_vocab_only else 0),
        help=("Build vocab using " if build_vocab_only else "Stop after save ")
        + "this number of transformed samples/corpus. Can be [-1, 0, N>0]. "
        "Set to -1 to go full corpus, 0 to skip.",
    )

    if not build_vocab_only:
        group.add(
            "-dump_transforms",
            "--dump_transforms",
            action="store_true",
            help="Dump transforms `*.transforms.pt` to disk."
            " -save_data should be set as saving prefix.",
        )
    else:
        group.add(
            "-dump_samples",
            "--dump_samples",
            action="store_true",
            help="Dump samples when building vocab. "
            "Warning: this may slow down the process.",
        )
        group.add(
            "-num_threads",
            "--num_threads",
            type=int,
            default=1,
            help="Number of parallel threads to build the vocab.",
        )
        group.add(
            "-learn_subwords",
            "--learn_subwords",
            action="store_true",
            help="Learn subwords prior to building vocab",
        )
        group.add(
            "-learn_subwords_size",
            "--learn_subwords_size",
            type=int,
            default=32000,
            help="Learn subwords operations",
        )
        group.add(
            "-vocab_sample_queue_size",
            "--vocab_sample_queue_size",
            type=int,
            default=20,
            help="Size of queues used in the build_vocab dump path.",
        )


def _add_features_opts(parser):
    group = parser.add_argument_group("Features")
    group.add(
        "-n_src_feats",
        "--n_src_feats",
        type=int,
        default=0,
        help="Number of source feats.",
    )
    group.add(
        "-src_feats_defaults",
        "--src_feats_defaults",
        help="Default features to apply in source in case " "there are not annotated",
    )


def _add_dynamic_vocab_opts(parser, build_vocab_only=False):
    """Options related to vocabulary and features.

    Add all options relate to vocabulary or features to parser.
    """
    group = parser.add_argument_group("Vocab")
    group.add(
        "-src_vocab",
        "--src_vocab",
        required=True,
        help=("Path to save" if build_vocab_only else "Path to")
        + " src (or shared) vocabulary file. "
        "Format: one <word> or <word>\t<count> per line.",
    )
    group.add(
        "-tgt_vocab",
        "--tgt_vocab",
        help=("Path to save" if build_vocab_only else "Path to")
        + " tgt vocabulary file. "
        "Format: one <word> or <word>\t<count> per line.",
    )
    group.add(
        "-share_vocab",
        "--share_vocab",
        action="store_true",
        help="Share source and target vocabulary.",
    )
    group.add(
        "--decoder_start_token",
        "-decoder_start_token",
        type=str,
        default=DefaultTokens.BOS,
        help="Default decoder start token "
        "for most ONMT models it is <s> = BOS "
        "it happens that for some Fairseq model it requires </s> ",
    )
    group.add(
        "--default_specials",
        "-default_specials",
        nargs="+",
        type=str,
        default=[
            DefaultTokens.UNK,
            DefaultTokens.PAD,
            DefaultTokens.BOS,
            DefaultTokens.EOS,
        ],
        help="default specials used for Vocab initialization"
        " UNK, PAD, BOS, EOS will take IDs 0, 1, 2, 3 "
        " typically <unk> <blank> <s> </s> ",
    )

    _add_features_opts(parser)

    if not build_vocab_only:
        group.add(
            "-src_vocab_size",
            "--src_vocab_size",
            type=int,
            default=32768,
            help="Maximum size of the source vocabulary.",
        )
        group.add(
            "-tgt_vocab_size",
            "--tgt_vocab_size",
            type=int,
            default=32768,
            help="Maximum size of the target vocabulary",
        )
        group.add(
            "-vocab_size_multiple",
            "--vocab_size_multiple",
            type=int,
            default=8,
            help="Make the vocabulary size a multiple of this value.",
        )

        group.add(
            "-src_words_min_frequency",
            "--src_words_min_frequency",
            type=int,
            default=0,
            help="Discard source words with lower frequency.",
        )
        group.add(
            "-tgt_words_min_frequency",
            "--tgt_words_min_frequency",
            type=int,
            default=0,
            help="Discard target words with lower frequency.",
        )

        # Truncation options, for text corpus
        group = parser.add_argument_group("Pruning")
        group.add(
            "--src_seq_length_trunc",
            "-src_seq_length_trunc",
            type=int,
            default=None,
            help="Truncate source sequence length.",
        )
        group.add(
            "--tgt_seq_length_trunc",
            "-tgt_seq_length_trunc",
            type=int,
            default=None,
            help="Truncate target sequence length.",
        )

        group = parser.add_argument_group("Embeddings")
        group.add(
            "-both_embeddings",
            "--both_embeddings",
            help="Path to the embeddings file to use "
            "for both source and target tokens.",
        )
        group.add(
            "-src_embeddings",
            "--src_embeddings",
            help="Path to the embeddings file to use for source tokens.",
        )
        group.add(
            "-tgt_embeddings",
            "--tgt_embeddings",
            help="Path to the embeddings file to use for target tokens.",
        )
        group.add(
            "-embeddings_type",
            "--embeddings_type",
            choices=["GloVe", "word2vec"],
            help="Type of embeddings file.",
        )


def _add_dynamic_transform_opts(parser):
    """Options related to transforms.

    Options that specified in the definitions of each transform class
    at `onmt/transforms/*.py`.
    """
    for name, transform_cls in AVAILABLE_TRANSFORMS.items():
        transform_cls.add_options(parser)


def dynamic_prepare_opts(parser, build_vocab_only=False):
    """Options related to data prepare in dynamic mode.

    Add all dynamic data prepare related options to parser.
    If `build_vocab_only` set to True, then only contains options that
    will be used in `onmt/bin/build_vocab.py`.
    """
    config_opts(parser)
    _add_dynamic_corpus_opts(parser, build_vocab_only=build_vocab_only)
    _add_dynamic_vocab_opts(parser, build_vocab_only=build_vocab_only)
    _add_dynamic_transform_opts(parser)

    if build_vocab_only:
        _add_reproducibility_opts(parser)
        # as for False, this will be added in _add_train_general_opts


def distributed_opts(parser):
    # GPU
    group = parser.add_argument_group("Distributed")
    group.add(
        "--gpu_ranks",
        "-gpu_ranks",
        default=[],
        nargs="*",
        type=int,
        help="list of ranks of each process.",
    )
    group.add(
        "--world_size",
        "-world_size",
        default=1,
        type=int,
        help="total number of distributed processes.",
    )
    group.add(
        "--parallel_mode",
        "-parallel_mode",
        default="data_parallel",
        choices=["tensor_parallel", "data_parallel"],
        type=str,
        help="Distributed mode.",
    )
    group.add(
        "--gpu_backend",
        "-gpu_backend",
        default="nccl",
        type=str,
        help="Type of torch distributed backend",
    )
    group.add(
        "--gpu_verbose_level",
        "-gpu_verbose_level",
        default=0,
        type=int,
        help="Gives more info on each process per GPU.",
    )
    group.add(
        "--master_ip",
        "-master_ip",
        default="localhost",
        type=str,
        help="IP of master for torch.distributed training.",
    )
    group.add(
        "--master_port",
        "-master_port",
        default=10000,
        type=int,
        help="Port of master for torch.distributed training.",
    )


def model_opts(parser):
    """
    These options are passed to the construction of the model.
    Be careful with these as they will be used during translation.
    """

    # Embedding Options
    group = parser.add_argument_group("Model-Embeddings")
    group.add(
        "--src_word_vec_size",
        "-src_word_vec_size",
        type=int,
        default=500,
        help="Word embedding size for src.",
    )
    group.add(
        "--tgt_word_vec_size",
        "-tgt_word_vec_size",
        type=int,
        default=500,
        help="Word embedding size for tgt.",
    )
    group.add(
        "--word_vec_size",
        "-word_vec_size",
        type=int,
        default=-1,
        help="Word embedding size for src and tgt.",
    )

    group.add(
        "--share_decoder_embeddings",
        "-share_decoder_embeddings",
        action="store_true",
        help="Use a shared weight matrix for the input and "
        "output word  embeddings in the decoder.",
    )
    group.add(
        "--share_embeddings",
        "-share_embeddings",
        action="store_true",
        help="Share the word embeddings between encoder "
        "and decoder. Need to use shared dictionary for this "
        "option.",
    )
    group.add(
        "--position_encoding",
        "-position_encoding",
        action="store_true",
        help="Use a sin to mark relative words positions. "
        "Necessary for non-RNN style models.",
    )
    group.add(
        "--position_encoding_type",
        "-position_encoding_type",
        type=str,
        default="SinusoidalInterleaved",
        choices=["SinusoidalInterleaved", "SinusoidalConcat"],
        help="Type of positional encoding. At the moment: "
        "Sinusoidal fixed, Interleaved or Concat",
    )

    group.add(
        "-update_vocab",
        "--update_vocab",
        action="store_true",
        help="Update source and target existing vocabularies",
    )

    group = parser.add_argument_group("Model-Embedding Features")
    group.add(
        "--feat_merge",
        "-feat_merge",
        type=str,
        default="concat",
        choices=["concat", "sum", "mlp"],
        help="Merge action for incorporating features embeddings. "
        "Options [concat|sum|mlp].",
    )
    group.add(
        "--feat_vec_size",
        "-feat_vec_size",
        type=int,
        default=-1,
        help="If specified, feature embedding sizes "
        "will be set to this. Otherwise, feat_vec_exponent "
        "will be used.",
    )
    group.add(
        "--feat_vec_exponent",
        "-feat_vec_exponent",
        type=float,
        default=0.7,
        help="If -feat_merge_size is not set, feature "
        "embedding sizes will be set to N^feat_vec_exponent "
        "where N is the number of values the feature takes.",
    )

    # Model Task Options
    group = parser.add_argument_group("Model- Task")
    group.add(
        "-model_task",
        "--model_task",
        default=ModelTask.SEQ2SEQ,
        choices=[ModelTask.SEQ2SEQ, ModelTask.LANGUAGE_MODEL],
        help="Type of task for the model either seq2seq or lm",
    )

    # Encoder-Decoder Options
    group = parser.add_argument_group("Model- Encoder-Decoder")
    group.add(
        "--model_type",
        "-model_type",
        default="text",
        choices=["text"],
        help="Type of source model to use. Allows "
        "the system to incorporate non-text inputs. "
        "Options are [text].",
    )
    group.add(
        "--model_dtype",
        "-model_dtype",
        default="fp32",
        choices=["fp32", "fp16"],
        help="Data type of the model.",
    )

    group.add(
        "--encoder_type",
        "-encoder_type",
        type=str,
        default="rnn",
        help="Type of encoder layer to use. Non-RNN layers "
        "are experimental. Default options are "
        "[rnn|brnn|ggnn|mean|transformer|cnn|transformer_lm].",
    )
    group.add(
        "--decoder_type",
        "-decoder_type",
        type=str,
        default="rnn",
        help="Type of decoder layer to use. Non-RNN layers "
        "are experimental. Default options are "
        "[rnn|transformer|cnn|transformer].",
    )

    # Freeze Encoder and/or Decoder
    group.add(
        "--freeze_encoder",
        "-freeze_encoder",
        action="store_true",
        help="Freeze parameters in encoder.",
    )
    group.add(
        "--freeze_decoder",
        "-freeze_decoder",
        action="store_true",
        help="Freeze parameters in decoder.",
    )

    group.add(
        "--layers", "-layers", type=int, default=-1, help="Number of layers in enc/dec."
    )
    group.add(
        "--enc_layers",
        "-enc_layers",
        type=int,
        default=2,
        help="Number of layers in the encoder",
    )
    group.add(
        "--dec_layers",
        "-dec_layers",
        type=int,
        default=2,
        help="Number of layers in the decoder",
    )
    group.add(
        "--hidden_size",
        "-hidden_size",
        type=int,
        default=-1,
        help="Size of rnn hidden states. Overwrites " "enc_hid_size and dec_hid_size",
    )
    group.add(
        "--enc_hid_size",
        "-enc_hid_size",
        type=int,
        default=500,
        help="Size of encoder rnn hidden states.",
    )
    group.add(
        "--dec_hid_size",
        "-dec_hid_size",
        type=int,
        default=500,
        help="Size of decoder rnn hidden states.",
    )
    group.add(
        "--cnn_kernel_width",
        "-cnn_kernel_width",
        type=int,
        default=3,
        help="Size of windows in the cnn, the kernel_size is "
        "(cnn_kernel_width, 1) in conv layer",
    )

    group.add(
        "--layer_norm",
        "-layer_norm",
        type=str,
        default="standard",
        choices=["standard", "rms"],
        help="The type of layer"
        " normalization in the transformer architecture. Choices are"
        " standard or rms. Default to standard",
    )
    group.add(
        "--norm_eps", "-norm_eps", type=float, default=1e-6, help="Layer norm epsilon"
    )

    group.add(
        "--pos_ffn_activation_fn",
        "-pos_ffn_activation_fn",
        type=str,
        default=ActivationFunction.relu,
        choices=ACTIVATION_FUNCTIONS.keys(),
        help="The activation"
        " function to use in PositionwiseFeedForward layer. Choices are"
        f" {ACTIVATION_FUNCTIONS.keys()}. Default to"
        f" {ActivationFunction.relu}.",
    )

    group.add(
        "--input_feed",
        "-input_feed",
        type=int,
        default=1,
        help="Feed the context vector at each time step as "
        "additional input (via concatenation with the word "
        "embeddings) to the decoder.",
    )
    group.add(
        "--bridge",
        "-bridge",
        action="store_true",
        help="Have an additional layer between the last encoder "
        "state and the first decoder state",
    )
    group.add(
        "--rnn_type",
        "-rnn_type",
        type=str,
        default="LSTM",
        choices=["LSTM", "GRU", "SRU"],
        action=CheckSRU,
        help="The gate type to use in the RNNs",
    )
    group.add(
        "--context_gate",
        "-context_gate",
        type=str,
        default=None,
        choices=["source", "target", "both"],
        help="Type of context gate to use. " "Do not select for no context gate.",
    )

    # The following options (bridge_extra_node to n_steps) are used
    # for training with --encoder_type ggnn (Gated Graph Neural Network).
    group.add(
        "--bridge_extra_node",
        "-bridge_extra_node",
        type=bool,
        default=True,
        help="Graph encoder bridges only extra node to decoder as input",
    )
    group.add(
        "--bidir_edges",
        "-bidir_edges",
        type=bool,
        default=True,
        help="Graph encoder autogenerates bidirectional edges",
    )
    group.add(
        "--state_dim",
        "-state_dim",
        type=int,
        default=512,
        help="Number of state dimensions in the graph encoder",
    )
    group.add(
        "--n_edge_types",
        "-n_edge_types",
        type=int,
        default=2,
        help="Number of edge types in the graph encoder",
    )
    group.add(
        "--n_node",
        "-n_node",
        type=int,
        default=2,
        help="Number of nodes in the graph encoder",
    )
    group.add(
        "--n_steps",
        "-n_steps",
        type=int,
        default=2,
        help="Number of steps to advance graph encoder",
    )
    group.add(
        "--src_ggnn_size",
        "-src_ggnn_size",
        type=int,
        default=0,
        help="Vocab size plus feature space for embedding input",
    )

    # Attention options
    group = parser.add_argument_group("Model- Attention")
    group.add(
        "--global_attention",
        "-global_attention",
        type=str,
        default="general",
        choices=["dot", "general", "mlp", "none"],
        help="The attention type to use: "
        "dotprod or general (Luong) or MLP (Bahdanau)",
    )
    group.add(
        "--global_attention_function",
        "-global_attention_function",
        type=str,
        default="softmax",
        choices=["softmax", "sparsemax"],
    )
    group.add(
        "--self_attn_type",
        "-self_attn_type",
        type=str,
        default="scaled-dot",
        help="Self attention type in Transformer decoder "
        'layer -- currently "scaled-dot" or "average" ',
    )
    group.add(
        "--max_relative_positions",
        "-max_relative_positions",
        type=int,
        default=0,
        help="This setting enable relative position encoding"
        "We support two types of encodings:"
        "set this -1 to enable Rotary Embeddings"
        "more info: https://arxiv.org/abs/2104.09864"
        "set this to > 0 (ex: 16, 32) to use"
        "Maximum distance between inputs in relative "
        "positions representations. "
        "more info: https://arxiv.org/pdf/1803.02155.pdf",
    )
    group.add(
        "--relative_positions_buckets",
        "-relative_positions_buckets",
        type=int,
        default=0,
        help="This setting enable relative position bias"
        "more info: https://github.com/google-research/text-to-text-transfer-transformer",
    )
    group.add(
        "--heads",
        "-heads",
        type=int,
        default=8,
        help="Number of heads for transformer self-attention",
    )
    group.add(
        "--transformer_ff",
        "-transformer_ff",
        type=int,
        default=2048,
        help="Size of hidden transformer feed-forward",
    )
    group.add(
        "--aan_useffn",
        "-aan_useffn",
        action="store_true",
        help="Turn on the FFN layer in the AAN decoder",
    )
    group.add(
        "--add_qkvbias",
        "-add_qkvbias",
        action="store_true",
        help="Add bias to nn.linear of Query/Key/Value in MHA"
        "Note: this will add bias to output proj layer too",
    )
    group.add(
        "--multiquery",
        "-multiquery",
        action="store_true",
        help="Use MultiQuery attention" "Note: https://arxiv.org/pdf/1911.02150.pdf",
    )
    group.add(
        "--num_kv",
        "-num_kv",
        type=int,
        default=0,
        help="Number of heads for KV in the variant of MultiQuery attention (egs: Falcon 40B)",
    )
    group.add(
        "--add_ffnbias",
        "-add_ffnbias",
        action="store_true",
        help="Add bias to nn.linear of Position_wise FFN",
    )
    group.add(
        "--parallel_residual",
        "-parallel_residual",
        action="store_true",
        help="Use Parallel residual in Decoder Layer"
        "Note: this is used by GPT-J / Falcon Architecture",
    )
    group.add(
        "--shared_layer_norm",
        "-shared_layer_norm",
        action="store_true",
        help="Use a shared layer_norm in parallel residual attention"
        "Note: must be true for Falcon 7B / false for Falcon 40B",
    )
    # Alignement options
    group = parser.add_argument_group("Model - Alignement")
    group.add(
        "--lambda_align",
        "-lambda_align",
        type=float,
        default=0.0,
        help="Lambda value for alignement loss of Garg et al (2019)"
        "For more detailed information, see: "
        "https://arxiv.org/abs/1909.02074",
    )
    group.add(
        "--alignment_layer",
        "-alignment_layer",
        type=int,
        default=-3,
        help="Layer number which has to be supervised.",
    )
    group.add(
        "--alignment_heads",
        "-alignment_heads",
        type=int,
        default=0,
        help="N. of cross attention heads per layer to supervised with",
    )
    group.add(
        "--full_context_alignment",
        "-full_context_alignment",
        action="store_true",
        help="Whether alignment is conditioned on full target context.",
    )

    # Generator and loss options.
    group = parser.add_argument_group("Generator")
    group.add(
        "--copy_attn",
        "-copy_attn",
        action="store_true",
        help="Train copy attention layer.",
    )
    group.add(
        "--copy_attn_type",
        "-copy_attn_type",
        type=str,
        default=None,
        choices=["dot", "general", "mlp", "none"],
        help="The copy attention type to use. Leave as None to use "
        "the same as -global_attention.",
    )
    group.add(
        "--generator_function",
        "-generator_function",
        default="softmax",
        choices=["softmax", "sparsemax"],
        help="Which function to use for generating "
        "probabilities over the target vocabulary (choices: "
        "softmax, sparsemax)",
    )
    group.add(
        "--copy_attn_force",
        "-copy_attn_force",
        action="store_true",
        help="When available, train to copy.",
    )
    group.add(
        "--reuse_copy_attn",
        "-reuse_copy_attn",
        action="store_true",
        help="Reuse standard attention for copy",
    )
    group.add(
        "--copy_loss_by_seqlength",
        "-copy_loss_by_seqlength",
        action="store_true",
        help="Divide copy loss by length of sequence",
    )
    group.add(
        "--coverage_attn",
        "-coverage_attn",
        action="store_true",
        help="Train a coverage attention layer.",
    )
    group.add(
        "--lambda_coverage",
        "-lambda_coverage",
        type=float,
        default=0.0,
        help="Lambda value for coverage loss of See et al (2017)",
    )
    group.add(
        "--lm_prior_model",
        "-lm_prior_model",
        type=str,
        default=None,
        help="LM model to used to train the TM",
    )
    group.add(
        "--lm_prior_lambda",
        "-lambda_prior_lambda",
        type=float,
        default=0.0,
        help="LM Prior Lambda",
    )
    group.add(
        "--lm_prior_tau",
        "-lambda_prior_tau",
        type=float,
        default=1.0,
        help="LM Prior Tau",
    )
    group.add(
        "--loss_scale",
        "-loss_scale",
        type=float,
        default=0,
        help="For FP16 training, the static loss scale to use. If not "
        "set, the loss scale is dynamically computed.",
    )
    group.add(
        "--apex_opt_level",
        "-apex_opt_level",
        type=str,
        default="",
        choices=["", "O0", "O1", "O2", "O3"],
        help="For FP16 training, the opt_level to use."
        "See https://nvidia.github.io/apex/amp.html#opt-levels.",
    )
    group.add(
        "--zero_out_prompt_loss",
        "-zero_out_prompt_loss",
        action="store_true",
        help="Set the prompt loss to zero."
        "Mostly for LLM finetuning."
        "Will be enabled only if the `insert_mask_before_placeholder` transform is applied",
    )
    group.add(
        "--use_ckpting",
        "-use_ckpting",
        default=[],
        nargs="+",
        choices=["ffn", "mha", "lora"],
        type=str,
        help="use gradient checkpointing those modules",
    )


def _add_train_general_opts(parser):
    """General options for training"""
    group = parser.add_argument_group("General")
    group.add(
        "--data_type",
        "-data_type",
        default="text",
        help="Type of the source input. " "Options are [text].",
    )

    group.add(
        "--save_model",
        "-save_model",
        default="model",
        help="Model filename (the model will be saved as "
        "<save_model>_N.pt where N is the number "
        "of steps",
    )

    group.add(
        "--save_format",
        "-save_format",
        default="pytorch",
        choices=["pytorch", "safetensors"],
        help="Format to save the model weights",
    )

    group.add(
        "--save_checkpoint_steps",
        "-save_checkpoint_steps",
        type=int,
        default=5000,
        help="""Save a checkpoint every X steps""",
    )
    group.add(
        "--keep_checkpoint",
        "-keep_checkpoint",
        type=int,
        default=-1,
        help="Keep X checkpoints (negative: keep all)",
    )

    # LoRa
    group.add(
        "--lora_layers",
        "-lora_layers",
        default=[],
        nargs="+",
        type=str,
        help="list of layers to be replaced by LoRa layers."
        " ex: ['linear_values', 'linear_query'] "
        " cf paper §4.2 https://arxiv.org/abs/2106.09685",
    )
    group.add(
        "--lora_embedding",
        "-lora_embedding",
        action="store_true",
        help="replace embeddings with LoRa Embeddings see §5.1",
    )
    group.add(
        "--lora_rank",
        "-lora_rank",
        type=int,
        default=2,
        help="r=2 successfully tested with NLLB-200 3.3B",
    )
    group.add(
        "--lora_alpha",
        "-lora_alpha",
        type=int,
        default=1,
        help="§4.1 https://arxiv.org/abs/2106.09685",
    )
    group.add(
        "--lora_dropout",
        "-lora_dropout",
        type=float,
        default=0.0,
        help="rule of thumb: same value as in main model",
    )

    _add_reproducibility_opts(parser)

    # Init options
    group = parser.add_argument_group("Initialization")
    group.add(
        "--param_init",
        "-param_init",
        type=float,
        default=0.1,
        help="Parameters are initialized over uniform distribution "
        "with support (-param_init, param_init). "
        "Use 0 to not use initialization",
    )
    group.add(
        "--param_init_glorot",
        "-param_init_glorot",
        action="store_true",
        help="Init parameters with xavier_uniform. " "Required for transformer.",
    )

    group.add(
        "--train_from",
        "-train_from",
        default="",
        type=str,
        help="If training from a checkpoint then this is the "
        "path to the pretrained model's state_dict.",
    )
    group.add(
        "--reset_optim",
        "-reset_optim",
        default="none",
        choices=["none", "all", "states", "keep_states"],
        help="Optimization resetter when train_from.",
    )

    # Pretrained word vectors
    group.add(
        "--pre_word_vecs_enc",
        "-pre_word_vecs_enc",
        help="If a valid path is specified, then this will load "
        "pretrained word embeddings on the encoder side. "
        "See README for specific formatting instructions.",
    )
    group.add(
        "--pre_word_vecs_dec",
        "-pre_word_vecs_dec",
        help="If a valid path is specified, then this will load "
        "pretrained word embeddings on the decoder side. "
        "See README for specific formatting instructions.",
    )
    # Freeze word vectors
    group.add(
        "--freeze_word_vecs_enc",
        "-freeze_word_vecs_enc",
        action="store_true",
        help="Freeze word embeddings on the encoder side.",
    )
    group.add(
        "--freeze_word_vecs_dec",
        "-freeze_word_vecs_dec",
        action="store_true",
        help="Freeze word embeddings on the decoder side.",
    )

    # Optimization options
    group = parser.add_argument_group("Optimization- Type")
    group.add(
        "--num_workers",
        "-num_workers",
        type=int,
        default=2,
        help="pytorch DataLoader num_workers",
    )
    group.add(
        "--batch_size",
        "-batch_size",
        type=int,
        default=64,
        help="Maximum batch size for training",
    )
    group.add(
        "--batch_size_multiple",
        "-batch_size_multiple",
        type=int,
        default=1,
        help="Batch size multiple for token batches.",
    )
    group.add(
        "--batch_type",
        "-batch_type",
        default="sents",
        choices=["sents", "tokens"],
        help="Batch grouping for batch_size. Standard "
        "is sents. Tokens will do dynamic batching",
    )
    group.add(
        "--normalization",
        "-normalization",
        default="sents",
        choices=["sents", "tokens"],
        help="Normalization method of the gradient.",
    )
    group.add(
        "--accum_count",
        "-accum_count",
        type=int,
        nargs="+",
        default=[1],
        help="Accumulate gradient this many times. "
        "Approximately equivalent to updating "
        "batch_size * accum_count batches at once. "
        "Recommended for Transformer.",
    )
    group.add(
        "--accum_steps",
        "-accum_steps",
        type=int,
        nargs="+",
        default=[0],
        help="Steps at which accum_count values change",
    )
    group.add(
        "--valid_steps",
        "-valid_steps",
        type=int,
        default=10000,
        help="Perfom validation every X steps",
    )
    group.add(
        "--valid_batch_size",
        "-valid_batch_size",
        type=int,
        default=32,
        help="Maximum batch size for validation",
    )
    group.add(
        "--train_steps",
        "-train_steps",
        type=int,
        default=100000,
        help="Number of training steps",
    )
    group.add(
        "--single_pass",
        "-single_pass",
        action="store_true",
        help="Make a single pass over the training dataset.",
    )
    group.add(
        "--early_stopping",
        "-early_stopping",
        type=int,
        default=0,
        help="Number of validation steps without improving.",
    )
    group.add(
        "--early_stopping_criteria",
        "-early_stopping_criteria",
        nargs="*",
        default=None,
        help="Criteria to use for early stopping.",
    )
    group.add(
        "--optim",
        "-optim",
        default="sgd",
        choices=[
            "sgd",
            "adagrad",
            "adadelta",
            "adam",
            "sparseadam",
            "adafactor",
            "fusedadam",
            "adamw8bit",
            "pagedadamw8bit",
            "pagedadamw32bit",
        ],
        help="Optimization method.",
    )
    group.add(
        "--adagrad_accumulator_init",
        "-adagrad_accumulator_init",
        type=float,
        default=0,
        help="Initializes the accumulator values in adagrad. "
        "Mirrors the initial_accumulator_value option "
        "in the tensorflow adagrad (use 0.1 for their default).",
    )
    group.add(
        "--max_grad_norm",
        "-max_grad_norm",
        type=float,
        default=5,
        help="If the norm of the gradient vector exceeds this, "
        "renormalize it to have the norm equal to "
        "max_grad_norm",
    )
    group.add(
        "--dropout",
        "-dropout",
        type=float,
        default=[0.3],
        nargs="+",
        help="Dropout probability; applied in LSTM stacks.",
    )
    group.add(
        "--attention_dropout",
        "-attention_dropout",
        type=float,
        default=[0.1],
        nargs="+",
        help="Attention Dropout probability.",
    )
    group.add(
        "--dropout_steps",
        "-dropout_steps",
        type=int,
        nargs="+",
        default=[0],
        help="Steps at which dropout changes.",
    )
    group.add(
        "--truncated_decoder",
        "-truncated_decoder",
        type=int,
        default=0,
        help="""Truncated bptt.""",
    )
    group.add(
        "--adam_beta1",
        "-adam_beta1",
        type=float,
        default=0.9,
        help="The beta1 parameter used by Adam. "
        "Almost without exception a value of 0.9 is used in "
        "the literature, seemingly giving good results, "
        "so we would discourage changing this value from "
        "the default without due consideration.",
    )
    group.add(
        "--adam_beta2",
        "-adam_beta2",
        type=float,
        default=0.999,
        help="The beta2 parameter used by Adam. "
        "Typically a value of 0.999 is recommended, as this is "
        "the value suggested by the original paper describing "
        "Adam, and is also the value adopted in other frameworks "
        "such as Tensorflow and Keras, i.e. see: "
        "https://www.tensorflow.org/api_docs/python/tf/train/Adam"
        "Optimizer or https://keras.io/optimizers/ . "
        'Whereas recently the paper "Attention is All You Need" '
        "suggested a value of 0.98 for beta2, this parameter may "
        "not work well for normal models / default "
        "baselines.",
    )
    group.add(
        "--label_smoothing",
        "-label_smoothing",
        type=float,
        default=0.0,
        help="Label smoothing value epsilon. "
        "Probabilities of all non-true labels "
        "will be smoothed by epsilon / (vocab_size - 1). "
        "Set to zero to turn off label smoothing. "
        "For more detailed information, see: "
        "https://arxiv.org/abs/1512.00567",
    )
    group.add(
        "--average_decay",
        "-average_decay",
        type=float,
        default=0,
        help="Moving average decay. "
        "Set to other than 0 (e.g. 1e-4) to activate. "
        "Similar to Marian NMT implementation: "
        "http://www.aclweb.org/anthology/P18-4020 "
        "For more detail on Exponential Moving Average: "
        "https://en.wikipedia.org/wiki/Moving_average",
    )
    group.add(
        "--average_every",
        "-average_every",
        type=int,
        default=1,
        help="Step for moving average. "
        "Default is every update, "
        "if -average_decay is set.",
    )

    # learning rate
    group = parser.add_argument_group("Optimization- Rate")
    group.add(
        "--learning_rate",
        "-learning_rate",
        type=float,
        default=1.0,
        help="Starting learning rate. "
        "Recommended settings: sgd = 1, adagrad = 0.1, "
        "adadelta = 1, adam = 0.001",
    )
    group.add(
        "--learning_rate_decay",
        "-learning_rate_decay",
        type=float,
        default=0.5,
        help="If update_learning_rate, decay learning rate by "
        "this much if steps have gone past "
        "start_decay_steps",
    )
    group.add(
        "--start_decay_steps",
        "-start_decay_steps",
        type=int,
        default=50000,
        help="Start decaying every decay_steps after " "start_decay_steps",
    )
    group.add(
        "--decay_steps",
        "-decay_steps",
        type=int,
        default=10000,
        help="Decay every decay_steps",
    )

    group.add(
        "--decay_method",
        "-decay_method",
        type=str,
        default="none",
        choices=["noam", "noamwd", "rsqrt", "none"],
        help="Use a custom decay rate.",
    )
    group.add(
        "--warmup_steps",
        "-warmup_steps",
        type=int,
        default=4000,
        help="Number of warmup steps for custom decay.",
    )
    _add_logging_opts(parser, is_train=True)


def _add_train_dynamic_data(parser):
    group = parser.add_argument_group("Dynamic data")
    group.add(
        "-bucket_size",
        "--bucket_size",
        type=int,
        default=262144,
        help="""A bucket is a buffer of bucket_size examples to pick
                   from the various Corpora. The dynamic iterator batches
                   batch_size batchs from the bucket and shuffle them.""",
    )
    group.add(
        "-bucket_size_init",
        "--bucket_size_init",
        type=int,
        default=-1,
        help="""The bucket is initalized with this awith this
               amount of examples (optional)""",
    )
    group.add(
        "-bucket_size_increment",
        "--bucket_size_increment",
        type=int,
        default=0,
        help="""The bucket size is incremented with this
              amount of examples (optional)""",
    )
    group.add(
        "-prefetch_factor",
        "--prefetch_factor",
        type=int,
        default=200,
        help="""number of mini-batches loaded in advance to avoid the
                   GPU waiting during the refilling of the bucket.""",
    )


def _add_quant_opts(parser):
    group = parser.add_argument_group("Quant options")
    group.add(
        "--quant_layers",
        "-quant_layers",
        default=[],
        nargs="+",
        type=str,
        help="list of layers to be compressed in 4/8bit.",
    )

    group.add(
        "--quant_type",
        "-quant_type",
        default="bnb_8bit",
        choices=["bnb_8bit", "bnb_FP4", "bnb_NF4"],
        type=str,
        help="Type of compression.",
    )


def train_opts(parser):
    """All options used in train."""
    # options relate to data preprare
    dynamic_prepare_opts(parser, build_vocab_only=False)
    distributed_opts(parser)
    # options relate to train
    model_opts(parser)
    _add_train_general_opts(parser)
    _add_train_dynamic_data(parser)
    _add_quant_opts(parser)


def _add_decoding_opts(parser):
    group = parser.add_argument_group("Beam Search")
    beam_size = group.add(
        "--beam_size", "-beam_size", type=int, default=5, help="Beam size"
    )
    group.add(
        "--ratio",
        "-ratio",
        type=float,
        default=-0.0,
        help="Ratio based beam stop condition",
    )

    group = parser.add_argument_group("Random Sampling")
    group.add(
        "--random_sampling_topk",
        "-random_sampling_topk",
        default=0,
        type=int,
        help="Set this to -1 to do random sampling from full "
        "distribution. Set this to value k>1 to do random "
        "sampling restricted to the k most likely next tokens. "
        "Set this to 1 to use argmax.",
    )
    group.add(
        "--random_sampling_topp",
        "-random_sampling_topp",
        default=0.0,
        type=float,
        help="Probability for top-p/nucleus sampling. Restrict tokens"
        " to the most likely until the cumulated probability is"
        " over p. In range [0, 1]."
        " https://arxiv.org/abs/1904.09751",
    )
    group.add(
        "--random_sampling_temp",
        "-random_sampling_temp",
        default=1.0,
        type=float,
        help="If doing random sampling, divide the logits by "
        "this before computing softmax during decoding.",
    )
    group._group_actions.append(beam_size)
    _add_reproducibility_opts(parser)

    group = parser.add_argument_group(
        "Penalties", ".. Note:: Coverage Penalty is not available in sampling."
    )
    # Alpha and Beta values for Google Length + Coverage penalty
    # Described here: https://arxiv.org/pdf/1609.08144.pdf, Section 7
    # Length penalty options
    group.add(
        "--length_penalty",
        "-length_penalty",
        default="avg",
        choices=["none", "wu", "avg"],
        help="Length Penalty to use.",
    )
    group.add(
        "--alpha",
        "-alpha",
        type=float,
        default=1.0,
        help="Length penalty parameter" "(higher = longer generation)",
    )
    # Coverage penalty options
    group.add(
        "--coverage_penalty",
        "-coverage_penalty",
        default="none",
        choices=["none", "wu", "summary"],
        help="Coverage Penalty to use. Only available in beam search.",
    )
    group.add(
        "--beta", "-beta", type=float, default=-0.0, help="Coverage penalty parameter"
    )
    group.add(
        "--stepwise_penalty",
        "-stepwise_penalty",
        action="store_true",
        help="Apply coverage penalty at every decoding step. "
        "Helpful for summary penalty.",
    )

    group = parser.add_argument_group(
        "Decoding tricks",
        ".. Tip:: Following options can be used to limit the decoding length "
        "or content.",
    )
    # Decoding Length constraint
    group.add(
        "--min_length",
        "-min_length",
        type=int,
        default=0,
        help="Minimum prediction length",
    )
    group.add(
        "--max_length",
        "-max_length",
        type=int,
        default=250,
        help="Maximum prediction length.",
    )
    # Decoding content constraint
    group.add(
        "--block_ngram_repeat",
        "-block_ngram_repeat",
        type=int,
        default=0,
        help="Block repetition of ngrams during decoding.",
    )
    group.add(
        "--ignore_when_blocking",
        "-ignore_when_blocking",
        nargs="+",
        type=str,
        default=[],
        help="Ignore these strings when blocking repeats. "
        "You want to block sentence delimiters.",
    )
    group.add(
        "--replace_unk",
        "-replace_unk",
        action="store_true",
        help="Replace the generated UNK tokens with the "
        "source token that had highest attention weight. If "
        "phrase_table is provided, it will look up the "
        "identified source token and give the corresponding "
        "target token. If it is not provided (or the identified "
        "source token does not exist in the table), then it "
        "will copy the source token.",
    )
    group.add(
        "--ban_unk_token",
        "-ban_unk_token",
        action="store_true",
        help="Prevent unk token generation by setting unk proba to 0",
    )
    group.add(
        "--phrase_table",
        "-phrase_table",
        type=str,
        default="",
        help="If phrase_table is provided (with replace_unk), it will "
        "look up the identified source token and give the "
        "corresponding target token. If it is not provided "
        "(or the identified source token does not exist in "
        "the table), then it will copy the source token.",
    )


def translate_opts(parser, dynamic=False):
    """Translation / inference options"""
    group = parser.add_argument_group("Model")
    group.add(
        "--model",
        "-model",
        dest="models",
        metavar="MODEL",
        nargs="+",
        type=str,
        default=[],
        required=True,
        help="Path to model .pt file(s). "
        "Multiple models can be specified, "
        "for ensemble decoding.",
    )
    group.add(
        "--precision",
        "-precision",
        default="",
        choices=["", "fp32", "fp16", "int8"],
        help="Precision to run inference."
        "default is model.dtype"
        "fp32 to force slow FP16 model on GTX1080"
        "int8 enables pytorch native 8-bit quantization"
        "(cpu only)",
    )
    group.add(
        "--fp32",
        "-fp32",
        action=DeprecateAction,
        help="Deprecated use 'precision' instead",
    )
    group.add(
        "--int8",
        "-int8",
        action=DeprecateAction,
        help="Deprecated use 'precision' instead",
    )
    group.add(
        "--avg_raw_probs",
        "-avg_raw_probs",
        action="store_true",
        help="If this is set, during ensembling scores from "
        "different models will be combined by averaging their "
        "raw probabilities and then taking the log. Otherwise, "
        "the log probabilities will be averaged directly. "
        "Necessary for models whose output layers can assign "
        "zero probability.",
    )

    group = parser.add_argument_group("Data")
    group.add(
        "--data_type",
        "-data_type",
        default="text",
        help="Type of the source input. Options: [text].",
    )

    group.add(
        "--src",
        "-src",
        required=True,
        help="Source sequence to decode (one line per " "sequence)",
    )
    group.add("--tgt", "-tgt", help="True target sequence (optional)")
    group.add(
        "--tgt_file_prefix",
        "-tgt_file_prefix",
        action="store_true",
        help="Generate predictions using provided `-tgt` as prefix.",
    )
    group.add(
        "--output",
        "-output",
        default="pred.txt",
        help="Path to output the predictions (each line will "
        "be the decoded sequence",
    )
    group.add(
        "--report_align",
        "-report_align",
        action="store_true",
        help="Report alignment for each translation.",
    )
    group.add(
        "--gold_align",
        "-gold_align",
        action="store_true",
        help="Report alignment between source and gold target."
        "Useful to test the performance of learnt alignments.",
    )
    group.add(
        "--report_time",
        "-report_time",
        action="store_true",
        help="Report some translation time metrics",
    )

    # Adding options related to source and target features
    _add_features_opts(parser)

    # Adding options relate to decoding strategy
    _add_decoding_opts(parser)

    # Adding option for logging
    _add_logging_opts(parser, is_train=False)

    distributed_opts(parser)

    group = parser.add_argument_group("Efficiency")
    group.add("--batch_size", "-batch_size", type=int, default=30, help="Batch size")
    group.add(
        "--batch_type",
        "-batch_type",
        default="sents",
        choices=["sents", "tokens"],
        help="Batch grouping for batch_size. Standard "
        "is sents. Tokens will do dynamic batching",
    )
    group.add("--gpu", "-gpu", type=int, default=-1, help="Device to run on")

    if dynamic:
        group.add(
            "-transforms",
            "--transforms",
            default=[],
            nargs="+",
            choices=AVAILABLE_TRANSFORMS.keys(),
            help="Default transform pipeline to apply to data.",
        )

        # Adding options related to Transforms
        _add_dynamic_transform_opts(parser)

    _add_quant_opts(parser)


# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.


class StoreLoggingLevelAction(configargparse.Action):
    """Convert string to logging level"""

    import logging

    LEVELS = {
        "CRITICAL": logging.CRITICAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }

    CHOICES = list(LEVELS.keys()) + [str(_) for _ in LEVELS.values()]

    def __init__(self, option_strings, dest, help=None, **kwargs):
        super(StoreLoggingLevelAction, self).__init__(
            option_strings, dest, help=help, **kwargs
        )

    def __call__(self, parser, namespace, value, option_string=None):
        # Get the key 'value' in the dict, or just use 'value'
        level = StoreLoggingLevelAction.LEVELS.get(value, value)
        setattr(namespace, self.dest, level)


class DeprecateAction(configargparse.Action):
    """Deprecate action"""

    def __init__(self, option_strings, dest, help=None, **kwargs):
        super(DeprecateAction, self).__init__(
            option_strings, dest, nargs=0, help=help, **kwargs
        )

    def __call__(self, parser, namespace, values, flag_name):
        help = self.help if self.help is not None else ""
        msg = "Flag '%s' is deprecated. %s" % (flag_name, help)
        raise configargparse.ArgumentTypeError(msg)