# Engine-build settings for the multi-zh distil-whisper model.
INFERENCE_PRECISION=float16
MAX_BEAM_WIDTH=4
MAX_BATCH_SIZE=64
checkpoint_dir=multi_zh_distil_tllm_checkpoint_pos_emb_true
output_dir=distil_whisper_multi_zh_remove_padding
# One-time checkpoint conversion, float16 (default):
# python3 convert_checkpoint.py \
#     --output_dir $checkpoint_dir \
#     --model_name distil-large-v2
#
# Or the int8 weight-only variant:
# checkpoint_dir=multi_zh_distil_tllm_int8_checkpoint_pos_emb_true
# output_dir=distil_whisper_multi_zh_int8_remove_padding
# python3 convert_checkpoint.py --use_weight_only \
#     --weight_only_precision int8 \
#     --output_dir $checkpoint_dir --model_name distil-large-v2
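
# Build the encoder engine from the converted ${checkpoint_dir}/encoder
# sub-checkpoint. Whisper-family encoders take fixed 30 s log-mel chunks
# at 100 frames/s, hence the 3000-frame max_input_len/max_seq_len.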
trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
    --output_dir ${output_dir}/encoder \
    --moe_plugin disable \
    --enable_xqa disable \
    --max_batch_size ${MAX_BATCH_SIZE} \
    --gemm_plugin disable \
    --bert_attention_plugin ${INFERENCE_PRECISION} \
    --max_input_len 3000 --max_seq_len 3000
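
# Build the decoder engine from ${checkpoint_dir}/decoder. One reading
# of the limits below: max_input_len 14 bounds the decoder prompt (the
# special-token prefix), so max_seq_len 114 leaves roughly 100 generated
# tokens per utterance; max_encoder_input_len matches the encoder's
# 3000-frame sequence length for cross-attention.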
trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
    --output_dir ${output_dir}/decoder \
    --moe_plugin disable \
    --enable_xqa disable \
    --max_beam_width ${MAX_BEAM_WIDTH} \
    --max_batch_size ${MAX_BATCH_SIZE} \
    --max_seq_len 114 \
    --max_input_len 14 \
    --max_encoder_input_len 3000 \
    --gemm_plugin ${INFERENCE_PRECISION} \
    --bert_attention_plugin ${INFERENCE_PRECISION} \
    --gpt_attention_plugin ${INFERENCE_PRECISION}
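
# Optional smoke test before benchmarking: a minimal sketch, assuming
# the run.py from the TensorRT-LLM whisper example and a sample wav
# under assets/ (the filename here is illustrative):
# python3 run.py --name single_wav_test \
#     --engine_dir $output_dir \
#     --input_file assets/1221-135766-0002.wav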
# Optional: benchmark CER on the WenetSpeech TEST_MEETING split.
# batch_size=32
# padding_strategy=zero
# dataset=wenet-e2e/wenetspeech
# dataset_name=TEST_MEETING
# python3 run.py --engine_dir $output_dir \
#     --enable_warmup \
#     --dataset $dataset \
#     --dataset_name $dataset_name \
#     --dataset_split test \
#     --compute_cer \
#     --text_prefix "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \
#     --name aishell_${dataset_name}_${output_dir}_padding_${padding_strategy}_batch_${batch_size}_cppsession \
#     --batch_size $batch_size --padding_strategy $padding_strategy
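# --compute_cer reports character error rate (the standard metric for
# Chinese ASR), and the text_prefix pins decoding to Mandarin
# transcription without timestamps.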