training/flax/pseudo_labelling_scripts/run_pseudo_labelling.sh

Saving train state of step 1

a1be16b verified 6 months ago

1.92 kB

	#!/usr/bin/env bash
	#
	# Runs run_pseudo_labelling.py once per dataset, using the model named in
	# MODEL_NAME and a shared set of generation / logging settings.
	#
	# Jobs, in order:
	#   1. distil-whisper/librispeech_asr       (config "all")
	#   2. distil-whisper/peoples_speech-clean  (config "clean")
	#
	# run_pseudo_labelling.py is invoked by relative path, so run this script
	# from the directory that contains it.

	# Fail on references to unset variables (catches typos in the names below).
	# `-e` is intentionally NOT enabled: without it a failure of the first job
	# does not prevent the second job from running, which is how this script
	# has always behaved. The script's exit status is that of the last job.
	set -u

	# --- Shared settings ------------------------------------------------------
	MODEL_NAME="openai/whisper-large-v3"     # --model_name_or_path
	# CACHE_DIR is passed as --cache_dir, --dataset_cache_dir and
	# --compilation_cache.
	CACHE_DIR="/home/sanchitgandhi/.cache"
	OUTPUT_DIR="./transcriptions-streaming"  # --output_dir
	WANDB_DIR="/home/sanchitgandhi/.cache"   # --wandb_dir
	WANDB_PROJECT="distil-whisper-label"     # --wandb_project
	BATCH_SIZE=64                            # --per_device_eval_batch_size
	NUM_BEAMS=1                              # --generation_num_beams
	MAX_LABEL_LENGTH=256                     # --max_label_length
	LOGGING_STEPS=500                        # --logging_steps
	NUM_WORKERS=64                           # --dataloader_num_workers
	RETURN_TIMESTAMPS=False                  # --return_timestamps

	#######################################
	# Run run_pseudo_labelling.py for a single dataset.
	# Every expansion is double-quoted so that values containing whitespace or
	# glob characters (e.g. a cache path with a space) stay a single argument.
	# Globals (read):
	#   MODEL_NAME, CACHE_DIR, OUTPUT_DIR, WANDB_DIR, WANDB_PROJECT, BATCH_SIZE,
	#   NUM_BEAMS, MAX_LABEL_LENGTH, LOGGING_STEPS, NUM_WORKERS,
	#   RETURN_TIMESTAMPS
	# Arguments:
	#   $1 - value for --dataset_name
	#   $2 - value for --dataset_config_name
	#   $3 - value for --data_split_name ('+'-joined split names)
	#   $4 - value for --wandb_name
	# Returns:
	#   Exit status of the python process.
	#######################################
	run_pseudo_labelling() {
		local dataset_name=$1
		local dataset_config=$2
		local data_splits=$3
		local wandb_name=$4

		python run_pseudo_labelling.py \
			--model_name_or_path "$MODEL_NAME" \
			--dataset_name "$dataset_name" \
			--dataset_config_name "$dataset_config" \
			--data_split_name "$data_splits" \
			--wandb_name "$wandb_name" \
			--cache_dir "$CACHE_DIR" \
			--dataset_cache_dir "$CACHE_DIR" \
			--output_dir "$OUTPUT_DIR" \
			--wandb_dir "$WANDB_DIR" \
			--wandb_project "$WANDB_PROJECT" \
			--per_device_eval_batch_size "$BATCH_SIZE" \
			--generation_num_beams "$NUM_BEAMS" \
			--max_label_length "$MAX_LABEL_LENGTH" \
			--logging_steps "$LOGGING_STEPS" \
			--dataloader_num_workers "$NUM_WORKERS" \
			--dtype "bfloat16" \
			--report_to "wandb" \
			--streaming True \
			--push_to_hub \
			--return_timestamps "$RETURN_TIMESTAMPS" \
			--compilation_cache "$CACHE_DIR"
	}

	# NOTE(review): both W&B run names below say "whisper-large-v2" although
	# MODEL_NAME is whisper-large-v3. They are kept unchanged here so existing
	# run names are not altered -- confirm whether the mismatch is intentional.

	run_pseudo_labelling \
		"distil-whisper/librispeech_asr" \
		"all" \
		"train.other.500+validation.clean+validation.other+test.clean+test.other" \
		"whisper-large-v2-librispeech_asr"

	run_pseudo_labelling \
		"distil-whisper/peoples_speech-clean" \
		"clean" \
		"train+validation+test" \
		"whisper-large-v2-peoples_speech-clean"