""" Implementation of all available options """ import configargparse from onmt.modules.sru import CheckSRU from onmt.transforms import AVAILABLE_TRANSFORMS from onmt.constants import ModelTask from onmt.modules.position_ffn import ACTIVATION_FUNCTIONS from onmt.modules.position_ffn import ActivationFunction from onmt.constants import DefaultTokens def config_opts(parser): group = parser.add_argument_group("Configuration") group.add( "-config", "--config", required=False, is_config_file_arg=True, help="Path of the main YAML config file.", ) group.add( "-save_config", "--save_config", required=False, is_write_out_config_file_arg=True, help="Path where to save the config.", ) def _add_logging_opts(parser, is_train=True): group = parser.add_argument_group("Logging") group.add( "--log_file", "-log_file", type=str, default="", help="Output logs to a file under this path.", ) group.add( "--log_file_level", "-log_file_level", type=str, action=StoreLoggingLevelAction, choices=StoreLoggingLevelAction.CHOICES, default="0", ) group.add( "--verbose", "-verbose", action="store_true", help="Print data loading and statistics for all process" "(default only log the first process shard)" if is_train else "Print scores and predictions for each sentence", ) if is_train: group.add( "--valid_metrics", "-valid_metrics", default=[], nargs="+", help="List of names of additional validation metrics", ) group.add( "--scoring_debug", "-scoring_debug", action="store_true", help="Dump the src/ref/pred of the current batch", ) group.add( "--dump_preds", "-dump_preds", type=str, default=None, help="Folder to dump predictions to.", ) group.add( "--report_every", "-report_every", type=int, default=50, help="Print stats at this interval.", ) group.add( "--exp_host", "-exp_host", type=str, default="", help="Send logs to this crayon server.", ) group.add( "--exp", "-exp", type=str, default="", help="Name of the experiment for logging.", ) # Use Tensorboard for visualization during training group.add( "--tensorboard", "-tensorboard", action="store_true", help="Use tensorboard for visualization during training. " "Must have the library tensorboard >= 1.14.", ) group.add( "--tensorboard_log_dir", "-tensorboard_log_dir", type=str, default="runs/onmt", help="Log directory for Tensorboard. " "This is also the name of the run.", ) group.add( "--override_opts", "-override-opts", action="store_true", help="Allow to override some checkpoint opts", ) else: # Options only during inference group.add( "--attn_debug", "-attn_debug", action="store_true", help="Print best attn for each word", ) group.add( "--align_debug", "-align_debug", action="store_true", help="Print best align for each word", ) group.add( "--dump_beam", "-dump_beam", type=str, default="", help="File to dump beam information to.", ) group.add( "--n_best", "-n_best", type=int, default=1, help="If verbose is set, will output the n_best " "decoded sentences", ) group.add( "--with_score", "-with_score", action="store_true", help="add a tab separated score to the translation", ) def _add_reproducibility_opts(parser): group = parser.add_argument_group("Reproducibility") group.add( "--seed", "-seed", type=int, default=-1, help="Set random seed used for better " "reproducibility between experiments.", ) def _add_dynamic_corpus_opts(parser, build_vocab_only=False): """Options related to training corpus, type: a list of dictionary.""" group = parser.add_argument_group("Data") group.add( "-data", "--data", required=True, help="List of datasets and their specifications. " "See examples/*.yaml for further details.", ) group.add( "-skip_empty_level", "--skip_empty_level", default="warning", choices=["silent", "warning", "error"], help="Security level when encounter empty examples." "silent: silently ignore/skip empty example;" "warning: warning when ignore/skip empty example;" "error: raise error & stop execution when encouter empty.", ) group.add( "-transforms", "--transforms", default=[], nargs="+", choices=AVAILABLE_TRANSFORMS.keys(), help="Default transform pipeline to apply to data. " "Can be specified in each corpus of data to override.", ) group.add( "-save_data", "--save_data", required=build_vocab_only, help="Output base path for objects that will " "be saved (vocab, transforms, embeddings, ...).", ) group.add( "-overwrite", "--overwrite", action="store_true", help="Overwrite existing objects if any.", ) group.add( "-n_sample", "--n_sample", type=int, default=(5000 if build_vocab_only else 0), help=("Build vocab using " if build_vocab_only else "Stop after save ") + "this number of transformed samples/corpus. Can be [-1, 0, N>0]. " "Set to -1 to go full corpus, 0 to skip.", ) if not build_vocab_only: group.add( "-dump_transforms", "--dump_transforms", action="store_true", help="Dump transforms `*.transforms.pt` to disk." " -save_data should be set as saving prefix.", ) else: group.add( "-dump_samples", "--dump_samples", action="store_true", help="Dump samples when building vocab. " "Warning: this may slow down the process.", ) group.add( "-num_threads", "--num_threads", type=int, default=1, help="Number of parallel threads to build the vocab.", ) group.add( "-learn_subwords", "--learn_subwords", action="store_true", help="Learn subwords prior to building vocab", ) group.add( "-learn_subwords_size", "--learn_subwords_size", type=int, default=32000, help="Learn subwords operations", ) group.add( "-vocab_sample_queue_size", "--vocab_sample_queue_size", type=int, default=20, help="Size of queues used in the build_vocab dump path.", ) def _add_features_opts(parser): group = parser.add_argument_group("Features") group.add( "-n_src_feats", "--n_src_feats", type=int, default=0, help="Number of source feats.", ) group.add( "-src_feats_defaults", "--src_feats_defaults", help="Default features to apply in source in case " "there are not annotated", ) def _add_dynamic_vocab_opts(parser, build_vocab_only=False): """Options related to vocabulary and features. Add all options relate to vocabulary or features to parser. """ group = parser.add_argument_group("Vocab") group.add( "-src_vocab", "--src_vocab", required=True, help=("Path to save" if build_vocab_only else "Path to") + " src (or shared) vocabulary file. " "Format: one or \t per line.", ) group.add( "-tgt_vocab", "--tgt_vocab", help=("Path to save" if build_vocab_only else "Path to") + " tgt vocabulary file. " "Format: one or \t per line.", ) group.add( "-share_vocab", "--share_vocab", action="store_true", help="Share source and target vocabulary.", ) group.add( "--decoder_start_token", "-decoder_start_token", type=str, default=DefaultTokens.BOS, help="Default decoder start token " "for most ONMT models it is = BOS " "it happens that for some Fairseq model it requires ", ) group.add( "--default_specials", "-default_specials", nargs="+", type=str, default=[ DefaultTokens.UNK, DefaultTokens.PAD, DefaultTokens.BOS, DefaultTokens.EOS, ], help="default specials used for Vocab initialization" " UNK, PAD, BOS, EOS will take IDs 0, 1, 2, 3 " " typically ", ) _add_features_opts(parser) if not build_vocab_only: group.add( "-src_vocab_size", "--src_vocab_size", type=int, default=32768, help="Maximum size of the source vocabulary.", ) group.add( "-tgt_vocab_size", "--tgt_vocab_size", type=int, default=32768, help="Maximum size of the target vocabulary", ) group.add( "-vocab_size_multiple", "--vocab_size_multiple", type=int, default=8, help="Make the vocabulary size a multiple of this value.", ) group.add( "-src_words_min_frequency", "--src_words_min_frequency", type=int, default=0, help="Discard source words with lower frequency.", ) group.add( "-tgt_words_min_frequency", "--tgt_words_min_frequency", type=int, default=0, help="Discard target words with lower frequency.", ) # Truncation options, for text corpus group = parser.add_argument_group("Pruning") group.add( "--src_seq_length_trunc", "-src_seq_length_trunc", type=int, default=None, help="Truncate source sequence length.", ) group.add( "--tgt_seq_length_trunc", "-tgt_seq_length_trunc", type=int, default=None, help="Truncate target sequence length.", ) group = parser.add_argument_group("Embeddings") group.add( "-both_embeddings", "--both_embeddings", help="Path to the embeddings file to use " "for both source and target tokens.", ) group.add( "-src_embeddings", "--src_embeddings", help="Path to the embeddings file to use for source tokens.", ) group.add( "-tgt_embeddings", "--tgt_embeddings", help="Path to the embeddings file to use for target tokens.", ) group.add( "-embeddings_type", "--embeddings_type", choices=["GloVe", "word2vec"], help="Type of embeddings file.", ) def _add_dynamic_transform_opts(parser): """Options related to transforms. Options that specified in the definitions of each transform class at `onmt/transforms/*.py`. """ for name, transform_cls in AVAILABLE_TRANSFORMS.items(): transform_cls.add_options(parser) def dynamic_prepare_opts(parser, build_vocab_only=False): """Options related to data prepare in dynamic mode. Add all dynamic data prepare related options to parser. If `build_vocab_only` set to True, then only contains options that will be used in `onmt/bin/build_vocab.py`. """ config_opts(parser) _add_dynamic_corpus_opts(parser, build_vocab_only=build_vocab_only) _add_dynamic_vocab_opts(parser, build_vocab_only=build_vocab_only) _add_dynamic_transform_opts(parser) if build_vocab_only: _add_reproducibility_opts(parser) # as for False, this will be added in _add_train_general_opts def distributed_opts(parser): # GPU group = parser.add_argument_group("Distributed") group.add( "--gpu_ranks", "-gpu_ranks", default=[], nargs="*", type=int, help="list of ranks of each process.", ) group.add( "--world_size", "-world_size", default=1, type=int, help="total number of distributed processes.", ) group.add( "--parallel_mode", "-parallel_mode", default="data_parallel", choices=["tensor_parallel", "data_parallel"], type=str, help="Distributed mode.", ) group.add( "--gpu_backend", "-gpu_backend", default="nccl", type=str, help="Type of torch distributed backend", ) group.add( "--gpu_verbose_level", "-gpu_verbose_level", default=0, type=int, help="Gives more info on each process per GPU.", ) group.add( "--master_ip", "-master_ip", default="localhost", type=str, help="IP of master for torch.distributed training.", ) group.add( "--master_port", "-master_port", default=10000, type=int, help="Port of master for torch.distributed training.", ) def model_opts(parser): """ These options are passed to the construction of the model. Be careful with these as they will be used during translation. """ # Embedding Options group = parser.add_argument_group("Model-Embeddings") group.add( "--src_word_vec_size", "-src_word_vec_size", type=int, default=500, help="Word embedding size for src.", ) group.add( "--tgt_word_vec_size", "-tgt_word_vec_size", type=int, default=500, help="Word embedding size for tgt.", ) group.add( "--word_vec_size", "-word_vec_size", type=int, default=-1, help="Word embedding size for src and tgt.", ) group.add( "--share_decoder_embeddings", "-share_decoder_embeddings", action="store_true", help="Use a shared weight matrix for the input and " "output word embeddings in the decoder.", ) group.add( "--share_embeddings", "-share_embeddings", action="store_true", help="Share the word embeddings between encoder " "and decoder. Need to use shared dictionary for this " "option.", ) group.add( "--position_encoding", "-position_encoding", action="store_true", help="Use a sin to mark relative words positions. " "Necessary for non-RNN style models.", ) group.add( "--position_encoding_type", "-position_encoding_type", type=str, default="SinusoidalInterleaved", choices=["SinusoidalInterleaved", "SinusoidalConcat"], help="Type of positional encoding. At the moment: " "Sinusoidal fixed, Interleaved or Concat", ) group.add( "-update_vocab", "--update_vocab", action="store_true", help="Update source and target existing vocabularies", ) group = parser.add_argument_group("Model-Embedding Features") group.add( "--feat_merge", "-feat_merge", type=str, default="concat", choices=["concat", "sum", "mlp"], help="Merge action for incorporating features embeddings. " "Options [concat|sum|mlp].", ) group.add( "--feat_vec_size", "-feat_vec_size", type=int, default=-1, help="If specified, feature embedding sizes " "will be set to this. Otherwise, feat_vec_exponent " "will be used.", ) group.add( "--feat_vec_exponent", "-feat_vec_exponent", type=float, default=0.7, help="If -feat_merge_size is not set, feature " "embedding sizes will be set to N^feat_vec_exponent " "where N is the number of values the feature takes.", ) # Model Task Options group = parser.add_argument_group("Model- Task") group.add( "-model_task", "--model_task", default=ModelTask.SEQ2SEQ, choices=[ModelTask.SEQ2SEQ, ModelTask.LANGUAGE_MODEL], help="Type of task for the model either seq2seq or lm", ) # Encoder-Decoder Options group = parser.add_argument_group("Model- Encoder-Decoder") group.add( "--model_type", "-model_type", default="text", choices=["text"], help="Type of source model to use. Allows " "the system to incorporate non-text inputs. " "Options are [text].", ) group.add( "--model_dtype", "-model_dtype", default="fp32", choices=["fp32", "fp16"], help="Data type of the model.", ) group.add( "--encoder_type", "-encoder_type", type=str, default="rnn", help="Type of encoder layer to use. Non-RNN layers " "are experimental. Default options are " "[rnn|brnn|ggnn|mean|transformer|cnn|transformer_lm].", ) group.add( "--decoder_type", "-decoder_type", type=str, default="rnn", help="Type of decoder layer to use. Non-RNN layers " "are experimental. Default options are " "[rnn|transformer|cnn|transformer].", ) # Freeze Encoder and/or Decoder group.add( "--freeze_encoder", "-freeze_encoder", action="store_true", help="Freeze parameters in encoder.", ) group.add( "--freeze_decoder", "-freeze_decoder", action="store_true", help="Freeze parameters in decoder.", ) group.add( "--layers", "-layers", type=int, default=-1, help="Number of layers in enc/dec." ) group.add( "--enc_layers", "-enc_layers", type=int, default=2, help="Number of layers in the encoder", ) group.add( "--dec_layers", "-dec_layers", type=int, default=2, help="Number of layers in the decoder", ) group.add( "--hidden_size", "-hidden_size", type=int, default=-1, help="Size of rnn hidden states. Overwrites " "enc_hid_size and dec_hid_size", ) group.add( "--enc_hid_size", "-enc_hid_size", type=int, default=500, help="Size of encoder rnn hidden states.", ) group.add( "--dec_hid_size", "-dec_hid_size", type=int, default=500, help="Size of decoder rnn hidden states.", ) group.add( "--cnn_kernel_width", "-cnn_kernel_width", type=int, default=3, help="Size of windows in the cnn, the kernel_size is " "(cnn_kernel_width, 1) in conv layer", ) group.add( "--layer_norm", "-layer_norm", type=str, default="standard", choices=["standard", "rms"], help="The type of layer" " normalization in the transformer architecture. Choices are" " standard or rms. Default to standard", ) group.add( "--norm_eps", "-norm_eps", type=float, default=1e-6, help="Layer norm epsilon" ) group.add( "--pos_ffn_activation_fn", "-pos_ffn_activation_fn", type=str, default=ActivationFunction.relu, choices=ACTIVATION_FUNCTIONS.keys(), help="The activation" " function to use in PositionwiseFeedForward layer. Choices are" f" {ACTIVATION_FUNCTIONS.keys()}. Default to" f" {ActivationFunction.relu}.", ) group.add( "--input_feed", "-input_feed", type=int, default=1, help="Feed the context vector at each time step as " "additional input (via concatenation with the word " "embeddings) to the decoder.", ) group.add( "--bridge", "-bridge", action="store_true", help="Have an additional layer between the last encoder " "state and the first decoder state", ) group.add( "--rnn_type", "-rnn_type", type=str, default="LSTM", choices=["LSTM", "GRU", "SRU"], action=CheckSRU, help="The gate type to use in the RNNs", ) group.add( "--context_gate", "-context_gate", type=str, default=None, choices=["source", "target", "both"], help="Type of context gate to use. " "Do not select for no context gate.", ) # The following options (bridge_extra_node to n_steps) are used # for training with --encoder_type ggnn (Gated Graph Neural Network). group.add( "--bridge_extra_node", "-bridge_extra_node", type=bool, default=True, help="Graph encoder bridges only extra node to decoder as input", ) group.add( "--bidir_edges", "-bidir_edges", type=bool, default=True, help="Graph encoder autogenerates bidirectional edges", ) group.add( "--state_dim", "-state_dim", type=int, default=512, help="Number of state dimensions in the graph encoder", ) group.add( "--n_edge_types", "-n_edge_types", type=int, default=2, help="Number of edge types in the graph encoder", ) group.add( "--n_node", "-n_node", type=int, default=2, help="Number of nodes in the graph encoder", ) group.add( "--n_steps", "-n_steps", type=int, default=2, help="Number of steps to advance graph encoder", ) group.add( "--src_ggnn_size", "-src_ggnn_size", type=int, default=0, help="Vocab size plus feature space for embedding input", ) # Attention options group = parser.add_argument_group("Model- Attention") group.add( "--global_attention", "-global_attention", type=str, default="general", choices=["dot", "general", "mlp", "none"], help="The attention type to use: " "dotprod or general (Luong) or MLP (Bahdanau)", ) group.add( "--global_attention_function", "-global_attention_function", type=str, default="softmax", choices=["softmax", "sparsemax"], ) group.add( "--self_attn_type", "-self_attn_type", type=str, default="scaled-dot", help="Self attention type in Transformer decoder " 'layer -- currently "scaled-dot" or "average" ', ) group.add( "--max_relative_positions", "-max_relative_positions", type=int, default=0, help="This setting enable relative position encoding" "We support two types of encodings:" "set this -1 to enable Rotary Embeddings" "more info: https://arxiv.org/abs/2104.09864" "set this to > 0 (ex: 16, 32) to use" "Maximum distance between inputs in relative " "positions representations. " "more info: https://arxiv.org/pdf/1803.02155.pdf", ) group.add( "--relative_positions_buckets", "-relative_positions_buckets", type=int, default=0, help="This setting enable relative position bias" "more info: https://github.com/google-research/text-to-text-transfer-transformer", ) group.add( "--heads", "-heads", type=int, default=8, help="Number of heads for transformer self-attention", ) group.add( "--transformer_ff", "-transformer_ff", type=int, default=2048, help="Size of hidden transformer feed-forward", ) group.add( "--aan_useffn", "-aan_useffn", action="store_true", help="Turn on the FFN layer in the AAN decoder", ) group.add( "--add_qkvbias", "-add_qkvbias", action="store_true", help="Add bias to nn.linear of Query/Key/Value in MHA" "Note: this will add bias to output proj layer too", ) group.add( "--multiquery", "-multiquery", action="store_true", help="Use MultiQuery attention" "Note: https://arxiv.org/pdf/1911.02150.pdf", ) group.add( "--num_kv", "-num_kv", type=int, default=0, help="Number of heads for KV in the variant of MultiQuery attention (egs: Falcon 40B)", ) group.add( "--add_ffnbias", "-add_ffnbias", action="store_true", help="Add bias to nn.linear of Position_wise FFN", ) group.add( "--parallel_residual", "-parallel_residual", action="store_true", help="Use Parallel residual in Decoder Layer" "Note: this is used by GPT-J / Falcon Architecture", ) group.add( "--shared_layer_norm", "-shared_layer_norm", action="store_true", help="Use a shared layer_norm in parallel residual attention" "Note: must be true for Falcon 7B / false for Falcon 40B", ) # Alignement options group = parser.add_argument_group("Model - Alignement") group.add( "--lambda_align", "-lambda_align", type=float, default=0.0, help="Lambda value for alignement loss of Garg et al (2019)" "For more detailed information, see: " "https://arxiv.org/abs/1909.02074", ) group.add( "--alignment_layer", "-alignment_layer", type=int, default=-3, help="Layer number which has to be supervised.", ) group.add( "--alignment_heads", "-alignment_heads", type=int, default=0, help="N. of cross attention heads per layer to supervised with", ) group.add( "--full_context_alignment", "-full_context_alignment", action="store_true", help="Whether alignment is conditioned on full target context.", ) # Generator and loss options. group = parser.add_argument_group("Generator") group.add( "--copy_attn", "-copy_attn", action="store_true", help="Train copy attention layer.", ) group.add( "--copy_attn_type", "-copy_attn_type", type=str, default=None, choices=["dot", "general", "mlp", "none"], help="The copy attention type to use. Leave as None to use " "the same as -global_attention.", ) group.add( "--generator_function", "-generator_function", default="softmax", choices=["softmax", "sparsemax"], help="Which function to use for generating " "probabilities over the target vocabulary (choices: " "softmax, sparsemax)", ) group.add( "--copy_attn_force", "-copy_attn_force", action="store_true", help="When available, train to copy.", ) group.add( "--reuse_copy_attn", "-reuse_copy_attn", action="store_true", help="Reuse standard attention for copy", ) group.add( "--copy_loss_by_seqlength", "-copy_loss_by_seqlength", action="store_true", help="Divide copy loss by length of sequence", ) group.add( "--coverage_attn", "-coverage_attn", action="store_true", help="Train a coverage attention layer.", ) group.add( "--lambda_coverage", "-lambda_coverage", type=float, default=0.0, help="Lambda value for coverage loss of See et al (2017)", ) group.add( "--lm_prior_model", "-lm_prior_model", type=str, default=None, help="LM model to used to train the TM", ) group.add( "--lm_prior_lambda", "-lambda_prior_lambda", type=float, default=0.0, help="LM Prior Lambda", ) group.add( "--lm_prior_tau", "-lambda_prior_tau", type=float, default=1.0, help="LM Prior Tau", ) group.add( "--loss_scale", "-loss_scale", type=float, default=0, help="For FP16 training, the static loss scale to use. If not " "set, the loss scale is dynamically computed.", ) group.add( "--apex_opt_level", "-apex_opt_level", type=str, default="", choices=["", "O0", "O1", "O2", "O3"], help="For FP16 training, the opt_level to use." "See https://nvidia.github.io/apex/amp.html#opt-levels.", ) group.add( "--zero_out_prompt_loss", "-zero_out_prompt_loss", action="store_true", help="Set the prompt loss to zero." "Mostly for LLM finetuning." "Will be enabled only if the `insert_mask_before_placeholder` transform is applied", ) group.add( "--use_ckpting", "-use_ckpting", default=[], nargs="+", choices=["ffn", "mha", "lora"], type=str, help="use gradient checkpointing those modules", ) def _add_train_general_opts(parser): """General options for training""" group = parser.add_argument_group("General") group.add( "--data_type", "-data_type", default="text", help="Type of the source input. " "Options are [text].", ) group.add( "--save_model", "-save_model", default="model", help="Model filename (the model will be saved as " "_N.pt where N is the number " "of steps", ) group.add( "--save_format", "-save_format", default="pytorch", choices=["pytorch", "safetensors"], help="Format to save the model weights", ) group.add( "--save_checkpoint_steps", "-save_checkpoint_steps", type=int, default=5000, help="""Save a checkpoint every X steps""", ) group.add( "--keep_checkpoint", "-keep_checkpoint", type=int, default=-1, help="Keep X checkpoints (negative: keep all)", ) # LoRa group.add( "--lora_layers", "-lora_layers", default=[], nargs="+", type=str, help="list of layers to be replaced by LoRa layers." " ex: ['linear_values', 'linear_query'] " " cf paper §4.2 https://arxiv.org/abs/2106.09685", ) group.add( "--lora_embedding", "-lora_embedding", action="store_true", help="replace embeddings with LoRa Embeddings see §5.1", ) group.add( "--lora_rank", "-lora_rank", type=int, default=2, help="r=2 successfully tested with NLLB-200 3.3B", ) group.add( "--lora_alpha", "-lora_alpha", type=int, default=1, help="§4.1 https://arxiv.org/abs/2106.09685", ) group.add( "--lora_dropout", "-lora_dropout", type=float, default=0.0, help="rule of thumb: same value as in main model", ) _add_reproducibility_opts(parser) # Init options group = parser.add_argument_group("Initialization") group.add( "--param_init", "-param_init", type=float, default=0.1, help="Parameters are initialized over uniform distribution " "with support (-param_init, param_init). " "Use 0 to not use initialization", ) group.add( "--param_init_glorot", "-param_init_glorot", action="store_true", help="Init parameters with xavier_uniform. " "Required for transformer.", ) group.add( "--train_from", "-train_from", default="", type=str, help="If training from a checkpoint then this is the " "path to the pretrained model's state_dict.", ) group.add( "--reset_optim", "-reset_optim", default="none", choices=["none", "all", "states", "keep_states"], help="Optimization resetter when train_from.", ) # Pretrained word vectors group.add( "--pre_word_vecs_enc", "-pre_word_vecs_enc", help="If a valid path is specified, then this will load " "pretrained word embeddings on the encoder side. " "See README for specific formatting instructions.", ) group.add( "--pre_word_vecs_dec", "-pre_word_vecs_dec", help="If a valid path is specified, then this will load " "pretrained word embeddings on the decoder side. " "See README for specific formatting instructions.", ) # Freeze word vectors group.add( "--freeze_word_vecs_enc", "-freeze_word_vecs_enc", action="store_true", help="Freeze word embeddings on the encoder side.", ) group.add( "--freeze_word_vecs_dec", "-freeze_word_vecs_dec", action="store_true", help="Freeze word embeddings on the decoder side.", ) # Optimization options group = parser.add_argument_group("Optimization- Type") group.add( "--num_workers", "-num_workers", type=int, default=2, help="pytorch DataLoader num_workers", ) group.add( "--batch_size", "-batch_size", type=int, default=64, help="Maximum batch size for training", ) group.add( "--batch_size_multiple", "-batch_size_multiple", type=int, default=1, help="Batch size multiple for token batches.", ) group.add( "--batch_type", "-batch_type", default="sents", choices=["sents", "tokens"], help="Batch grouping for batch_size. Standard " "is sents. Tokens will do dynamic batching", ) group.add( "--normalization", "-normalization", default="sents", choices=["sents", "tokens"], help="Normalization method of the gradient.", ) group.add( "--accum_count", "-accum_count", type=int, nargs="+", default=[1], help="Accumulate gradient this many times. " "Approximately equivalent to updating " "batch_size * accum_count batches at once. " "Recommended for Transformer.", ) group.add( "--accum_steps", "-accum_steps", type=int, nargs="+", default=[0], help="Steps at which accum_count values change", ) group.add( "--valid_steps", "-valid_steps", type=int, default=10000, help="Perfom validation every X steps", ) group.add( "--valid_batch_size", "-valid_batch_size", type=int, default=32, help="Maximum batch size for validation", ) group.add( "--train_steps", "-train_steps", type=int, default=100000, help="Number of training steps", ) group.add( "--single_pass", "-single_pass", action="store_true", help="Make a single pass over the training dataset.", ) group.add( "--early_stopping", "-early_stopping", type=int, default=0, help="Number of validation steps without improving.", ) group.add( "--early_stopping_criteria", "-early_stopping_criteria", nargs="*", default=None, help="Criteria to use for early stopping.", ) group.add( "--optim", "-optim", default="sgd", choices=[ "sgd", "adagrad", "adadelta", "adam", "sparseadam", "adafactor", "fusedadam", "adamw8bit", "pagedadamw8bit", "pagedadamw32bit", ], help="Optimization method.", ) group.add( "--adagrad_accumulator_init", "-adagrad_accumulator_init", type=float, default=0, help="Initializes the accumulator values in adagrad. " "Mirrors the initial_accumulator_value option " "in the tensorflow adagrad (use 0.1 for their default).", ) group.add( "--max_grad_norm", "-max_grad_norm", type=float, default=5, help="If the norm of the gradient vector exceeds this, " "renormalize it to have the norm equal to " "max_grad_norm", ) group.add( "--dropout", "-dropout", type=float, default=[0.3], nargs="+", help="Dropout probability; applied in LSTM stacks.", ) group.add( "--attention_dropout", "-attention_dropout", type=float, default=[0.1], nargs="+", help="Attention Dropout probability.", ) group.add( "--dropout_steps", "-dropout_steps", type=int, nargs="+", default=[0], help="Steps at which dropout changes.", ) group.add( "--truncated_decoder", "-truncated_decoder", type=int, default=0, help="""Truncated bptt.""", ) group.add( "--adam_beta1", "-adam_beta1", type=float, default=0.9, help="The beta1 parameter used by Adam. " "Almost without exception a value of 0.9 is used in " "the literature, seemingly giving good results, " "so we would discourage changing this value from " "the default without due consideration.", ) group.add( "--adam_beta2", "-adam_beta2", type=float, default=0.999, help="The beta2 parameter used by Adam. " "Typically a value of 0.999 is recommended, as this is " "the value suggested by the original paper describing " "Adam, and is also the value adopted in other frameworks " "such as Tensorflow and Keras, i.e. see: " "https://www.tensorflow.org/api_docs/python/tf/train/Adam" "Optimizer or https://keras.io/optimizers/ . " 'Whereas recently the paper "Attention is All You Need" ' "suggested a value of 0.98 for beta2, this parameter may " "not work well for normal models / default " "baselines.", ) group.add( "--label_smoothing", "-label_smoothing", type=float, default=0.0, help="Label smoothing value epsilon. " "Probabilities of all non-true labels " "will be smoothed by epsilon / (vocab_size - 1). " "Set to zero to turn off label smoothing. " "For more detailed information, see: " "https://arxiv.org/abs/1512.00567", ) group.add( "--average_decay", "-average_decay", type=float, default=0, help="Moving average decay. " "Set to other than 0 (e.g. 1e-4) to activate. " "Similar to Marian NMT implementation: " "http://www.aclweb.org/anthology/P18-4020 " "For more detail on Exponential Moving Average: " "https://en.wikipedia.org/wiki/Moving_average", ) group.add( "--average_every", "-average_every", type=int, default=1, help="Step for moving average. " "Default is every update, " "if -average_decay is set.", ) # learning rate group = parser.add_argument_group("Optimization- Rate") group.add( "--learning_rate", "-learning_rate", type=float, default=1.0, help="Starting learning rate. " "Recommended settings: sgd = 1, adagrad = 0.1, " "adadelta = 1, adam = 0.001", ) group.add( "--learning_rate_decay", "-learning_rate_decay", type=float, default=0.5, help="If update_learning_rate, decay learning rate by " "this much if steps have gone past " "start_decay_steps", ) group.add( "--start_decay_steps", "-start_decay_steps", type=int, default=50000, help="Start decaying every decay_steps after " "start_decay_steps", ) group.add( "--decay_steps", "-decay_steps", type=int, default=10000, help="Decay every decay_steps", ) group.add( "--decay_method", "-decay_method", type=str, default="none", choices=["noam", "noamwd", "rsqrt", "none"], help="Use a custom decay rate.", ) group.add( "--warmup_steps", "-warmup_steps", type=int, default=4000, help="Number of warmup steps for custom decay.", ) _add_logging_opts(parser, is_train=True) def _add_train_dynamic_data(parser): group = parser.add_argument_group("Dynamic data") group.add( "-bucket_size", "--bucket_size", type=int, default=262144, help="""A bucket is a buffer of bucket_size examples to pick from the various Corpora. The dynamic iterator batches batch_size batchs from the bucket and shuffle them.""", ) group.add( "-bucket_size_init", "--bucket_size_init", type=int, default=-1, help="""The bucket is initalized with this awith this amount of examples (optional)""", ) group.add( "-bucket_size_increment", "--bucket_size_increment", type=int, default=0, help="""The bucket size is incremented with this amount of examples (optional)""", ) group.add( "-prefetch_factor", "--prefetch_factor", type=int, default=200, help="""number of mini-batches loaded in advance to avoid the GPU waiting during the refilling of the bucket.""", ) def _add_quant_opts(parser): group = parser.add_argument_group("Quant options") group.add( "--quant_layers", "-quant_layers", default=[], nargs="+", type=str, help="list of layers to be compressed in 4/8bit.", ) group.add( "--quant_type", "-quant_type", default="bnb_8bit", choices=["bnb_8bit", "bnb_FP4", "bnb_NF4"], type=str, help="Type of compression.", ) def train_opts(parser): """All options used in train.""" # options relate to data preprare dynamic_prepare_opts(parser, build_vocab_only=False) distributed_opts(parser) # options relate to train model_opts(parser) _add_train_general_opts(parser) _add_train_dynamic_data(parser) _add_quant_opts(parser) def _add_decoding_opts(parser): group = parser.add_argument_group("Beam Search") beam_size = group.add( "--beam_size", "-beam_size", type=int, default=5, help="Beam size" ) group.add( "--ratio", "-ratio", type=float, default=-0.0, help="Ratio based beam stop condition", ) group = parser.add_argument_group("Random Sampling") group.add( "--random_sampling_topk", "-random_sampling_topk", default=0, type=int, help="Set this to -1 to do random sampling from full " "distribution. Set this to value k>1 to do random " "sampling restricted to the k most likely next tokens. " "Set this to 1 to use argmax.", ) group.add( "--random_sampling_topp", "-random_sampling_topp", default=0.0, type=float, help="Probability for top-p/nucleus sampling. Restrict tokens" " to the most likely until the cumulated probability is" " over p. In range [0, 1]." " https://arxiv.org/abs/1904.09751", ) group.add( "--random_sampling_temp", "-random_sampling_temp", default=1.0, type=float, help="If doing random sampling, divide the logits by " "this before computing softmax during decoding.", ) group._group_actions.append(beam_size) _add_reproducibility_opts(parser) group = parser.add_argument_group( "Penalties", ".. Note:: Coverage Penalty is not available in sampling." ) # Alpha and Beta values for Google Length + Coverage penalty # Described here: https://arxiv.org/pdf/1609.08144.pdf, Section 7 # Length penalty options group.add( "--length_penalty", "-length_penalty", default="avg", choices=["none", "wu", "avg"], help="Length Penalty to use.", ) group.add( "--alpha", "-alpha", type=float, default=1.0, help="Length penalty parameter" "(higher = longer generation)", ) # Coverage penalty options group.add( "--coverage_penalty", "-coverage_penalty", default="none", choices=["none", "wu", "summary"], help="Coverage Penalty to use. Only available in beam search.", ) group.add( "--beta", "-beta", type=float, default=-0.0, help="Coverage penalty parameter" ) group.add( "--stepwise_penalty", "-stepwise_penalty", action="store_true", help="Apply coverage penalty at every decoding step. " "Helpful for summary penalty.", ) group = parser.add_argument_group( "Decoding tricks", ".. Tip:: Following options can be used to limit the decoding length " "or content.", ) # Decoding Length constraint group.add( "--min_length", "-min_length", type=int, default=0, help="Minimum prediction length", ) group.add( "--max_length", "-max_length", type=int, default=250, help="Maximum prediction length.", ) # Decoding content constraint group.add( "--block_ngram_repeat", "-block_ngram_repeat", type=int, default=0, help="Block repetition of ngrams during decoding.", ) group.add( "--ignore_when_blocking", "-ignore_when_blocking", nargs="+", type=str, default=[], help="Ignore these strings when blocking repeats. " "You want to block sentence delimiters.", ) group.add( "--replace_unk", "-replace_unk", action="store_true", help="Replace the generated UNK tokens with the " "source token that had highest attention weight. If " "phrase_table is provided, it will look up the " "identified source token and give the corresponding " "target token. If it is not provided (or the identified " "source token does not exist in the table), then it " "will copy the source token.", ) group.add( "--ban_unk_token", "-ban_unk_token", action="store_true", help="Prevent unk token generation by setting unk proba to 0", ) group.add( "--phrase_table", "-phrase_table", type=str, default="", help="If phrase_table is provided (with replace_unk), it will " "look up the identified source token and give the " "corresponding target token. If it is not provided " "(or the identified source token does not exist in " "the table), then it will copy the source token.", ) def translate_opts(parser, dynamic=False): """Translation / inference options""" group = parser.add_argument_group("Model") group.add( "--model", "-model", dest="models", metavar="MODEL", nargs="+", type=str, default=[], required=True, help="Path to model .pt file(s). " "Multiple models can be specified, " "for ensemble decoding.", ) group.add( "--precision", "-precision", default="", choices=["", "fp32", "fp16", "int8"], help="Precision to run inference." "default is model.dtype" "fp32 to force slow FP16 model on GTX1080" "int8 enables pytorch native 8-bit quantization" "(cpu only)", ) group.add( "--fp32", "-fp32", action=DeprecateAction, help="Deprecated use 'precision' instead", ) group.add( "--int8", "-int8", action=DeprecateAction, help="Deprecated use 'precision' instead", ) group.add( "--avg_raw_probs", "-avg_raw_probs", action="store_true", help="If this is set, during ensembling scores from " "different models will be combined by averaging their " "raw probabilities and then taking the log. Otherwise, " "the log probabilities will be averaged directly. " "Necessary for models whose output layers can assign " "zero probability.", ) group = parser.add_argument_group("Data") group.add( "--data_type", "-data_type", default="text", help="Type of the source input. Options: [text].", ) group.add( "--src", "-src", required=True, help="Source sequence to decode (one line per " "sequence)", ) group.add("--tgt", "-tgt", help="True target sequence (optional)") group.add( "--tgt_file_prefix", "-tgt_file_prefix", action="store_true", help="Generate predictions using provided `-tgt` as prefix.", ) group.add( "--output", "-output", default="pred.txt", help="Path to output the predictions (each line will " "be the decoded sequence", ) group.add( "--report_align", "-report_align", action="store_true", help="Report alignment for each translation.", ) group.add( "--gold_align", "-gold_align", action="store_true", help="Report alignment between source and gold target." "Useful to test the performance of learnt alignments.", ) group.add( "--report_time", "-report_time", action="store_true", help="Report some translation time metrics", ) # Adding options related to source and target features _add_features_opts(parser) # Adding options relate to decoding strategy _add_decoding_opts(parser) # Adding option for logging _add_logging_opts(parser, is_train=False) distributed_opts(parser) group = parser.add_argument_group("Efficiency") group.add("--batch_size", "-batch_size", type=int, default=30, help="Batch size") group.add( "--batch_type", "-batch_type", default="sents", choices=["sents", "tokens"], help="Batch grouping for batch_size. Standard " "is sents. Tokens will do dynamic batching", ) group.add("--gpu", "-gpu", type=int, default=-1, help="Device to run on") if dynamic: group.add( "-transforms", "--transforms", default=[], nargs="+", choices=AVAILABLE_TRANSFORMS.keys(), help="Default transform pipeline to apply to data.", ) # Adding options related to Transforms _add_dynamic_transform_opts(parser) _add_quant_opts(parser) # Copyright 2016 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. class StoreLoggingLevelAction(configargparse.Action): """Convert string to logging level""" import logging LEVELS = { "CRITICAL": logging.CRITICAL, "ERROR": logging.ERROR, "WARNING": logging.WARNING, "INFO": logging.INFO, "DEBUG": logging.DEBUG, "NOTSET": logging.NOTSET, } CHOICES = list(LEVELS.keys()) + [str(_) for _ in LEVELS.values()] def __init__(self, option_strings, dest, help=None, **kwargs): super(StoreLoggingLevelAction, self).__init__( option_strings, dest, help=help, **kwargs ) def __call__(self, parser, namespace, value, option_string=None): # Get the key 'value' in the dict, or just use 'value' level = StoreLoggingLevelAction.LEVELS.get(value, value) setattr(namespace, self.dest, level) class DeprecateAction(configargparse.Action): """Deprecate action""" def __init__(self, option_strings, dest, help=None, **kwargs): super(DeprecateAction, self).__init__( option_strings, dest, nargs=0, help=help, **kwargs ) def __call__(self, parser, namespace, values, flag_name): help = self.help if self.help is not None else "" msg = "Flag '%s' is deprecated. %s" % (flag_name, help) raise configargparse.ArgumentTypeError(msg)