File size: 8,302 Bytes
f9d7028 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# This script preprocesses and binarizes the data for training translation models using fairseq.
# The only difference between this script and `` is that this script generates the
# fairseq dictionary, which is then shared across the training of all subsequent models.
# Log the start time of the run.
echo `date`
# Positional arguments; arguments 2-4 default to paths underneath the experiment directory.
exp_dir=$1 # path to the experiment directory
vocab_dir=${2:-"$exp_dir/vocab"} # path to the spm-based tokenizer directory
train_data_dir=${3:-"$exp_dir/train"} # path to the train data within experiment directory
devtest_data_dir=${4:-"$exp_dir/devtest/all"} # path to the devtest data within experiment directory
# Directory that contains this script; used below to locate helper scripts.
root=$(dirname $0)
echo "Running experiment ${exp_dir}"
# NOTE(review): train_processed_dir, devtest_processed_dir and out_data_dir are not assigned
# anywhere in the visible text. If they are really unset at this point, each `mkdir -p`
# below receives no operand. Confirm where these are defined.
mkdir -p $train_processed_dir
mkdir -p $devtest_processed_dir
mkdir -p $out_data_dir
# Check if GNU Parallel is installed
# NOTE(review): later steps test `$parallel_installed`, but no assignment to it is visible,
# and no `else`/`fi` for this `if` is visible either. Presumably both branches set that
# flag to true/false - confirm against the full script.
if command -v parallel &> /dev/null; then
echo "GNU Parallel is installed. Version information:"
parallel --version
# get a list of language pairs in the `train_data_dir`
# (`pairs` is a plain string, not an array; the loop below relies on word splitting of
# the `ls` output, so directory names containing whitespace would be split apart)
pairs=$(ls -d $train_data_dir/* | sort)
# iterate over each language pair
for pair in ${pairs[@]}; do
# extract the source and target languages from the pair name
# (the directory name is split on "-": field 1 is the source, field 2 is the target)
pair=$(basename $pair)
src_lang=$(echo "$pair" | cut -d "-" -f 1)
tgt_lang=$(echo "$pair" | cut -d "-" -f 2)
echo "$src_lang - $tgt_lang"
# NOTE(review): train_norm_dir / devtest_norm_dir, and the *_infname_* / *_outfname_*
# variables used below, are not assigned in the visible text - presumably derived per
# pair from the data directories; confirm.
mkdir -p $train_norm_dir
mkdir -p $devtest_norm_dir
# check if the source language text requires transliteration
# NOTE(review): the test matches language codes containing Arab, Olck, Mtei or Latn, but
# the body of this `if` (and its `else`/`fi`) is not visible. Presumably it sets
# `src_transliterate`, which is consumed further down - confirm.
if [[ $src_lang == *"Arab"* ]] || [[ $src_lang == *"Olck"* ]] || \
[[ $src_lang == *"Mtei"* ]] || [[ $src_lang == *"Latn"* ]]; then
# check if the target language text requires transliteration
# NOTE(review): same pattern as above for the target side; presumably sets
# `tgt_transliterate` - the body is not visible.
if [[ $tgt_lang == *"Arab"* ]] || [[ $tgt_lang == *"Olck"* ]] || \
[[ $tgt_lang == *"Mtei"* ]] || [[ $tgt_lang == *"Latn"* ]]; then
# --------------------------------------------------------------------------
# train preprocessing
# --------------------------------------------------------------------------
echo "Normalizing punctuations for train"
# The first pair of commands streams the input through GNU Parallel (`--pipe` splits
# stdin into chunks, `--keep-order` keeps output in input order); the second pair runs
# the same helper directly on the whole file.
# NOTE(review): no `else`/`fi` is visible between the two pairs, so as written both would
# run. The helper script name after `$root/` is also missing from the visible text.
if $parallel_installed; then
parallel --pipe --keep-order bash $root/ $src_lang < $train_infname_src > $train_outfname_src._norm
parallel --pipe --keep-order bash $root/ $tgt_lang < $train_infname_tgt > $train_outfname_tgt._norm
bash $root/ $src_lang < $train_infname_src > $train_outfname_src._norm
bash $root/ $tgt_lang < $train_infname_tgt > $train_outfname_tgt._norm
# add do not translate tags to handle special failure cases
# (reads the two `._norm` files and writes the two `.norm` files; the python script name
# under `scripts/` is missing from the visible text)
echo "Applying do not translate tags for train"
python3 scripts/ $train_outfname_src._norm $train_outfname_tgt._norm $train_outfname_src.norm $train_outfname_tgt.norm
echo "Applying normalization and script conversion for train"
# this script preprocesses the text and for indic languages, converts script to devanagari if needed
# The stdout of each call is captured in `input_size`; the target-side call overwrites the
# value from the source-side call, so the count echoed below comes from the target side.
input_size=`python3 scripts/ $train_outfname_src.norm $train_outfname_src $src_lang $src_transliterate false`
input_size=`python3 scripts/ $train_outfname_tgt.norm $train_outfname_tgt $tgt_lang $tgt_transliterate true`
echo "Number of sentences in train: $input_size"
# --------------------------------------------------------------------------
# dev preprocessing
# --------------------------------------------------------------------------
echo "Normalizing punctuations for dev"
# Same parallel / non-parallel pairing as in the train section.
# NOTE(review): here the helper script path is missing entirely, so `bash` is given the
# language code as its first argument; the `else`/`fi` are again not visible.
if $parallel_installed; then
parallel --pipe --keep-order bash $src_lang < $dev_infname_src > $dev_outfname_src._norm
parallel --pipe --keep-order bash $tgt_lang < $dev_infname_tgt > $dev_outfname_tgt._norm
bash $src_lang < $dev_infname_src > $dev_outfname_src._norm
bash $tgt_lang < $dev_infname_tgt > $dev_outfname_tgt._norm
# add do not translate tags to handle special failure cases
echo "Applying do not translate tags for dev"
python3 scripts/ $dev_outfname_src._norm $dev_outfname_tgt._norm $dev_outfname_src.norm $dev_outfname_tgt.norm
echo "Applying normalization and script conversion for dev"
# this script preprocesses the text and for indic languages, converts script to devanagari if needed
# NOTE(review): from here on the script invokes `python` rather than `python3` as in the
# train section - confirm both resolve to the same interpreter.
input_size=`python scripts/ $dev_outfname_src.norm $dev_outfname_src $src_lang $src_transliterate false`
input_size=`python scripts/ $dev_outfname_tgt.norm $dev_outfname_tgt $tgt_lang $tgt_transliterate true`
echo "Number of sentences in dev: $input_size"
# NOTE(review): the `done` that closes the per-pair loop is not visible; the steps below
# operate on the whole experiment directory, so presumably they run once after the loop.
# this concatenates lang pair data and creates text files to keep track of number of
# lines in each lang pair. this is important for joint training, as we will merge all
# the lang pairs and the individual lang lines info would be required for adding specific
# lang tags later.
# the outputs of these scripts will be text file like this:
# <lang1> <lang2> <number of lines>
# lang1-lang2 n1
# lang1-lang3 n2
python scripts/ $exp_dir/norm $exp_dir/data 'train'
python scripts/ $exp_dir/norm $exp_dir/data 'dev'
# tokenization of train and dev set using the spm trained models
mkdir -p $exp_dir/bpe
splits=(train dev)
for split in ${splits[@]}; do
echo "Applying sentence piece for $split"
# NOTE(review): the helper script path is missing, so `$exp_dir` is the first argument to
# `bash` as written. The remaining arguments are the input dir, output dir, the SRC/TGT
# file suffixes, the split name and the parallel flag.
bash $exp_dir $exp_dir/data $exp_dir/bpe SRC TGT $split $parallel_installed
# this is only required for joint training
# we apply language tags to the bpe segmented data
# if we are translating lang1 to lang2 then <lang1 line> will become <lang1> <lang2> <lang1 line>
mkdir -p $exp_dir/final
echo "Adding language tags"
python scripts/ $exp_dir 'train'
python scripts/ $exp_dir 'dev'
# this is important step if you are training with tpu and using num_batch_buckets
# the current implementation does not remove outliers before bucketing and hence
# removing these large sentences ourselves helps with getting better buckets
# (the three commands below are intentionally left commented out)
# python scripts/ $exp_dir/bpe/train.SRC $exp_dir/bpe/train.TGT $exp_dir/final/train.SRC $exp_dir/final/train.TGT
# python scripts/ $exp_dir/bpe/dev.SRC $exp_dir/bpe/dev.TGT $exp_dir/final/dev.SRC $exp_dir/final/dev.TGT
# python scripts/ $exp_dir/bpe/test.SRC $exp_dir/bpe/test.TGT $exp_dir/final/test.SRC $exp_dir/final/test.TGT
echo "Binarizing data"
# use cpu_count to get num_workers instead of setting it manually when running
# in different instances
num_workers=`python -c "import multiprocessing; print(multiprocessing.cpu_count())"`
# Remove any previous binarized output before regenerating it.
# NOTE(review): `out_data_dir` and `data_dir` are not assigned in the visible text; with
# an empty `out_data_dir` this `rm -rf` has no operand - confirm where they are set.
rm -rf $out_data_dir
# Binarize the train/dev text with SRC/TGT as the language suffixes, writing to
# `$out_data_dir` with one worker per CPU. See the fairseq-preprocess documentation for
# the exact meaning of `--thresholdtgt`.
fairseq-preprocess \
--source-lang SRC --target-lang TGT \
--trainpref $data_dir/train \
--validpref $data_dir/dev \
--destdir $out_data_dir \
--workers $num_workers \
--thresholdtgt 5