#!/bin/bash
#
# Normalizes, tokenizes, tags and binarizes parallel train/dev data that lives
# under an experiment directory.
#
# Stages, in the order this file runs them:
#   1. per language pair: punctuation normalization, regex tagging,
#      normalization / script conversion (train and dev)
#   2. concatenation of all pairs (scripts/concat_joint_data.py)
#   3. SentencePiece application (apply_sentence_piece.sh)
#   4. language tagging (scripts/add_joint_tags_translate.py)
#   5. binarization (fairseq-preprocess)
#
# Usage:
#   bash <this-script> EXP_DIR [VOCAB_DIR] [TRAIN_DATA_DIR] [DEVTEST_DATA_DIR]
#
# Arguments:
#   EXP_DIR           experiment directory (required)
#   VOCAB_DIR         default: EXP_DIR/vocab (stored in vocab_dir; not
#                     referenced again in this file)
#   TRAIN_DATA_DIR    default: EXP_DIR/train; holds one "<src>-<tgt>" entry per
#                     language pair
#   DEVTEST_DATA_DIR  default: EXP_DIR/devtest/all
#
# The helper scripts are referenced by relative path (scripts/..., 
# apply_sentence_piece.sh), so run this from the directory that contains them.

date

# Refuse to run without EXP_DIR: every output path below is derived from it,
# and an empty value would make them resolve against the filesystem root.
exp_dir=${1:?"usage: $0 EXP_DIR [VOCAB_DIR] [TRAIN_DATA_DIR] [DEVTEST_DATA_DIR]"}
vocab_dir=${2:-"$exp_dir/vocab"}
train_data_dir=${3:-"$exp_dir/train"}
devtest_data_dir=${4:-"$exp_dir/devtest/all"}

# Directory holding this script; used to locate normalize_punctuation.sh.
root=$(dirname -- "$0")

echo "Running experiment ${exp_dir}"

# Train and dev/test intermediates share the same directory.
train_processed_dir=$exp_dir/data
devtest_processed_dir=$exp_dir/data
out_data_dir=$exp_dir/final_bin

mkdir -p -- "$train_processed_dir" "$devtest_processed_dir" "$out_data_dir"

# Detect GNU Parallel once. The result ("true"/"false") is consulted by the
# normalization steps and forwarded to apply_sentence_piece.sh.
parallel_installed=false
if command -v parallel >/dev/null 2>&1; then
  echo "GNU Parallel is installed. Version information:"
  parallel --version
  parallel_installed=true
fi

#######################################
# Decide the transliteration flag handed to scripts/preprocess_translate.py.
# Arguments: $1 - language code (e.g. "hin_Deva")
# Outputs:   "false" when the code contains Arab, Olck, Mtei or Latn,
#            "true" otherwise
#######################################
transliteration_flag() {
  case "$1" in
    *Arab* | *Olck* | *Mtei* | *Latn*) echo "false" ;;
    *) echo "true" ;;
  esac
}

#######################################
# Run normalize_punctuation.sh (located next to this script) over one file,
# through GNU Parallel when it was detected.
# Globals:   root (read), parallel_installed (read)
# Arguments: $1 - language code, $2 - input file, $3 - output file
#######################################
run_punct_normalizer() {
  local lang=$1 infile=$2 outfile=$3
  # Explicit string comparison: an empty/unset flag must not count as "true".
  if [[ "$parallel_installed" == true ]]; then
    parallel --pipe --keep-order bash "$root/normalize_punctuation.sh" "$lang" \
      < "$infile" > "$outfile"
  else
    bash "$root/normalize_punctuation.sh" "$lang" < "$infile" > "$outfile"
  fi
}

#######################################
# Preprocess one split (train or dev) of one language pair:
# punctuation normalization -> regex tagging -> normalization / script
# conversion. Intermediate files get the suffixes "._norm" and ".norm";
# final files are <norm_dir>/<split>.<lang>.
# Arguments:
#   $1 - split name, used in file names and log messages
#   $2 - input directory containing <split>.<src_lang> and <split>.<tgt_lang>
#   $3 - output (norm) directory
#   $4 - source language code      $5 - target language code
#   $6 - source transliterate flag $7 - target transliterate flag
# Outputs:   progress messages on stdout
#######################################
preprocess_split() {
  local split=$1 in_dir=$2 norm_dir=$3
  local src_lang=$4 tgt_lang=$5 src_translit=$6 tgt_translit=$7
  local in_src=$in_dir/$split.$src_lang
  local in_tgt=$in_dir/$split.$tgt_lang
  local out_src=$norm_dir/$split.$src_lang
  local out_tgt=$norm_dir/$split.$tgt_lang
  local input_size

  echo "Normalizing punctuations for $split"
  run_punct_normalizer "$src_lang" "$in_src" "$out_src._norm"
  run_punct_normalizer "$tgt_lang" "$in_tgt" "$out_tgt._norm"

  echo "Applying do not translate tags for $split"
  python3 scripts/normalize_regex.py \
    "$out_src._norm" "$out_tgt._norm" "$out_src.norm" "$out_tgt.norm"

  echo "Applying normalization and script conversion for $split"
  # Both calls print a value that is captured; only the target-side value is
  # reported below (the source-side one is overwritten).
  input_size=$(python3 scripts/preprocess_translate.py \
    "$out_src.norm" "$out_src" "$src_lang" "$src_translit" false)
  input_size=$(python3 scripts/preprocess_translate.py \
    "$out_tgt.norm" "$out_tgt" "$tgt_lang" "$tgt_translit" true)
  echo "Number of sentences in $split: $input_size"
}

# Walk every "<src>-<tgt>" entry of the training directory. Glob expansion is
# already sorted, so no ls/sort parsing is needed.
if [[ -d "$train_data_dir" ]]; then
  for pair_path in "$train_data_dir"/*; do
    # An unmatched glob stays literal; skip it.
    [[ -e "$pair_path" ]] || continue

    pair=${pair_path##*/}
    # First and second "-"-separated fields of the entry name.
    src_lang=${pair%%-*}
    tgt_lang=${pair#*-}
    tgt_lang=${tgt_lang%%-*}
    echo "$src_lang - $tgt_lang"

    # Train and dev outputs of a pair share one directory.
    train_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
    devtest_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
    mkdir -p -- "$train_norm_dir" "$devtest_norm_dir"

    src_transliterate=$(transliteration_flag "$src_lang")
    tgt_transliterate=$(transliteration_flag "$tgt_lang")

    preprocess_split train "$train_data_dir/${src_lang}-${tgt_lang}" \
      "$train_norm_dir" "$src_lang" "$tgt_lang" \
      "$src_transliterate" "$tgt_transliterate"

    preprocess_split dev "$devtest_data_dir/${src_lang}-${tgt_lang}" \
      "$devtest_norm_dir" "$src_lang" "$tgt_lang" \
      "$src_transliterate" "$tgt_transliterate"
  done
else
  echo "Training data directory not found: $train_data_dir" >&2
fi

# Merge the per-pair normalized files under EXP_DIR/norm into EXP_DIR/data,
# once per split. python3 is used to match the interpreter the per-pair
# preprocessing steps already rely on.
: "${exp_dir:?exp_dir must be set}"
for split in train dev; do
  python3 scripts/concat_joint_data.py "$exp_dir/norm" "$exp_dir/data" "$split"
done

# Apply SentencePiece to the concatenated data; output goes to EXP_DIR/bpe.
# The guard keeps an empty exp_dir from turning the paths into /bpe, /data.
: "${exp_dir:?exp_dir must be set}"
mkdir -p -- "$exp_dir/bpe"

splits=(train dev)
for split in "${splits[@]}"; do
  echo "Applying sentence piece for $split"
  # apply_sentence_piece.sh is resolved against the current working directory.
  bash apply_sentence_piece.sh "$exp_dir" "$exp_dir/data" "$exp_dir/bpe" \
    SRC TGT "$split" "$parallel_installed"
done

# Create EXP_DIR/final, then run the tagging script once per split.
# python3 matches the interpreter used by the per-pair preprocessing steps.
: "${exp_dir:?exp_dir must be set}"
mkdir -p -- "$exp_dir/final"

echo "Adding language tags"
for split in train dev; do
  python3 scripts/add_joint_tags_translate.py "$exp_dir" "$split"
done

echo "Binarizing data"

# One fairseq worker per CPU as reported by Python's multiprocessing module.
num_workers=$(python3 -c "import multiprocessing; print(multiprocessing.cpu_count())")

# ':?' aborts when exp_dir is empty, so the rm -rf below can never be pointed
# at a path rooted at "/".
data_dir=${exp_dir:?exp_dir must be set}/final
out_data_dir=$exp_dir/final_bin

# Remove any previous binarized output before regenerating it.
rm -rf -- "${out_data_dir:?}"

fairseq-preprocess \
  --source-lang SRC --target-lang TGT \
  --trainpref "$data_dir/train" \
  --validpref "$data_dir/dev" \
  --destdir "$out_data_dir" \
  --workers "$num_workers" \
  --thresholdtgt 5