#!/usr/bin/env bash
#
# Launches run_parler_tts_training.py with a fixed set of arguments, using the
# parler-tts/parler_tts_mini_v0.1 model and the
# reach-vb/expresso-tagged-mistral-7b-instruct-v0.2 dataset ("read" config).
#
# Usage: run from the directory that contains run_parler_tts_training.py
# (the script path below is relative). Any extra command-line arguments given
# to this launcher are appended after the fixed ones.

set -euo pipefail

# The arguments live in an array rather than a backslash-continued command so
# that a stray trailing character cannot silently split the invocation and so
# that groups can carry comments.
train_args=(
  # Model, feature extractor and tokenizers.
  --model_name_or_path parler-tts/parler_tts_mini_v0.1
  --feature_extractor_name parler-tts/dac_44khZ_8kbps
  --description_tokenizer_name parler-tts/parler_tts_mini_v0.1
  --prompt_tokenizer_name parler-tts/parler_tts_mini_v0.1

  # Reporting / output handling.
  --report_to wandb
  --overwrite_output_dir true

  # Training data.
  --train_dataset_name reach-vb/expresso-tagged-mistral-7b-instruct-v0.2
  --train_metadata_dataset_name reach-vb/expresso-tagged-mistral-7b-instruct-v0.2
  --train_dataset_config_name read
  --train_split_name train

  # Evaluation data.
  # NOTE(review): this is the same dataset, config and split ("train") as the
  # training data above, limited to 8 samples -- confirm this is intended.
  --eval_dataset_name reach-vb/expresso-tagged-mistral-7b-instruct-v0.2
  --eval_metadata_dataset_name reach-vb/expresso-tagged-mistral-7b-instruct-v0.2
  --eval_dataset_config_name read
  --eval_split_name train
  --max_eval_samples 8
  --per_device_eval_batch_size 16

  # Dataset column names and length limits.
  --target_audio_column_name audio
  --description_column_name text_description
  --prompt_column_name text
  --max_duration_in_seconds 20
  --min_duration_in_seconds 2.0
  --max_text_length 400
  --preprocessing_num_workers 2

  # Optimisation hyper-parameters.
  --do_train true
  --num_train_epochs 10
  --gradient_accumulation_steps 4
  --gradient_checkpointing true
  --per_device_train_batch_size 32
  --learning_rate 3e-5
  --adam_beta1 0.9
  --adam_beta2 0.99
  --weight_decay 0.01
  --warmup_steps 100
  --logging_steps 2
  --freeze_text_encoder true
  --audio_encoder_per_device_batch_size 4
  --dtype bfloat16
  --seed 456

  # Output locations.
  # NOTE(review): output_dir is the current directory and
  # overwrite_output_dir is true -- confirm that is the desired target.
  --output_dir ./
  --temporary_save_to_disk ../audio_code_tmp/
  --save_to_disk ../tmp_dataset_audio/
  --dataloader_num_workers 4

  # Evaluation / generation switches and Hub upload.
  --do_eval
  --predict_with_generate
  --include_inputs_for_metrics
  --group_by_length true
  --push_to_hub
)

python run_parler_tts_training.py "${train_args[@]}" "$@"