#!/bin/bash

# This script preprocesses and binarizes the data for training translation models with
# fairseq, using the shared fairseq dict initially generated by `prepare_data_joint_training.sh`.
# We primarily use this script for training all our models.
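
# A minimal usage sketch (the script filename here is hypothetical; the arguments are
# the positional parameters documented below, and only the first one is required):
#   bash prepare_data.sh <exp_dir> [<vocab_dir>] [<train_data_dir>] [<devtest_data_dir>]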


echo `date`
exp_dir=$1                                      # path to the experiment directory
vocab_dir=${2:-"$exp_dir/vocab"}                # path to the spm-based tokenizer directory
train_data_dir=${3:-"$exp_dir/train"}           # path to the train data within experiment directory
devtest_data_dir=${4:-"$exp_dir/devtest/all"}   # path to the devtest data within experiment directory

root=$(dirname "$0")

echo "Running experiment ${exp_dir}"

train_processed_dir=$exp_dir/data
devtest_processed_dir=$exp_dir/data
out_data_dir=$exp_dir/final_bin

mkdir -p $train_processed_dir
mkdir -p $devtest_processed_dir
mkdir -p $out_data_dir

parallel_installed=false

# Check if GNU Parallel is installed
if command -v parallel &> /dev/null; then
    echo "GNU Parallel is installed. Version information:"
    parallel --version
    parallel_installed=true
fi

# get a list of language pairs in the `train_data_dir`
pairs=($(ls -d $train_data_dir/* | sort))
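# e.g. (illustrative layout, matching the path construction further below):
#   $train_data_dir/eng_Latn-hin_Deva/train.eng_Latn
#   $train_data_dir/eng_Latn-hin_Deva/train.hin_Deva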


# iterate over each language pair
for pair in ${pairs[@]}; do
    # extract the source and target languages from the pair name
    pair=$(basename $pair)
    src_lang=$(echo "$pair" | cut -d "-" -f 1)
    tgt_lang=$(echo "$pair" | cut -d "-" -f 2)
    echo "$src_lang - $tgt_lang"

    train_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
    devtest_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
    mkdir -p $train_norm_dir
    mkdir -p $devtest_norm_dir

    
    # check if the source language text requires transliteration
    src_transliterate="true"
    if [[ $src_lang == *"Arab"* ]] || [[ $src_lang == *"Olck"* ]] || \
        [[ $src_lang == *"Mtei"* ]] || [[ $src_lang == *"Latn"* ]]; then
        src_transliterate="false"
    fi
    
    # check if the target language text requires transliteration
    tgt_transliterate="true"
    if [[ $tgt_lang == *"Arab"* ]] || [[ $tgt_lang == *"Olck"* ]] || \
        [[ $tgt_lang == *"Mtei"* ]] || [[ $tgt_lang == *"Latn"* ]]; then
        tgt_transliterate="false"
    fi
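
    # e.g. (illustrative): eng_Latn and urd_Arab are left in their original scripts
    # (transliterate=false), while tam_Taml would be mapped to Devanagari by the
    # preprocessing step below (transliterate=true)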


    # --------------------------------------------------------------------------
    #                           train preprocessing
    # --------------------------------------------------------------------------
    train_infname_src=$train_data_dir/${src_lang}-${tgt_lang}/train.$src_lang
    train_infname_tgt=$train_data_dir/${src_lang}-${tgt_lang}/train.$tgt_lang
    train_outfname_src=$train_norm_dir/train.$src_lang
    train_outfname_tgt=$train_norm_dir/train.$tgt_lang

    echo "Normalizing punctuations for train"
    if $parallel_installed; then
        parallel --pipe --keep-order bash $root/normalize_punctuation.sh $src_lang < $train_infname_src > $train_outfname_src._norm
        parallel --pipe --keep-order bash $root/normalize_punctuation.sh $tgt_lang < $train_infname_tgt > $train_outfname_tgt._norm
    else
        bash $root/normalize_punctuation.sh $src_lang < $train_infname_src > $train_outfname_src._norm
        bash $root/normalize_punctuation.sh $tgt_lang < $train_infname_tgt > $train_outfname_tgt._norm
    fi

    # add do not translate tags to handle special failure cases
    echo "Applying do not translate tags for train"
    python3 scripts/normalize_regex.py $train_outfname_src._norm $train_outfname_tgt._norm $train_outfname_src.norm $train_outfname_tgt.norm

	echo "Applying normalization and script conversion for train"
    # this script preprocesses the text and for indic languages, converts script to devanagari if needed
	input_size=`python3 scripts/preprocess_translate.py $train_outfname_src.norm $train_outfname_src $src_lang $src_transliterate false`
	input_size=`python3 scripts/preprocess_translate.py $train_outfname_tgt.norm $train_outfname_tgt $tgt_lang $tgt_transliterate true`
	echo "Number of sentences in train: $input_size"


    # --------------------------------------------------------------------------
    #                              dev preprocessing
    # --------------------------------------------------------------------------
    dev_infname_src=$devtest_data_dir/${src_lang}-${tgt_lang}/dev.$src_lang
    dev_infname_tgt=$devtest_data_dir/${src_lang}-${tgt_lang}/dev.$tgt_lang
    dev_outfname_src=$devtest_norm_dir/dev.$src_lang
    dev_outfname_tgt=$devtest_norm_dir/dev.$tgt_lang

    echo "Normalizing punctuations for dev"
    if $parallel_installed; then
        parallel --pipe --keep-order bash $root/normalize_punctuation.sh $src_lang < $dev_infname_src > $dev_outfname_src._norm
        parallel --pipe --keep-order bash $root/normalize_punctuation.sh $tgt_lang < $dev_infname_tgt > $dev_outfname_tgt._norm
    else
        bash $root/normalize_punctuation.sh $src_lang < $dev_infname_src > $dev_outfname_src._norm
        bash $root/normalize_punctuation.sh $tgt_lang < $dev_infname_tgt > $dev_outfname_tgt._norm
    fi

    # add do not translate tags to handle special failure cases
    echo "Applying do not translate tags for dev"
    python3 scripts/normalize_regex.py $dev_outfname_src._norm $dev_outfname_tgt._norm $dev_outfname_src.norm $dev_outfname_tgt.norm

    echo "Applying normalization and script conversion for dev"
    # this script preprocesses the text and, for Indic languages, converts the script to Devanagari if needed
    input_size=$(python3 scripts/preprocess_translate.py $dev_outfname_src.norm $dev_outfname_src $src_lang $src_transliterate false)
    input_size=$(python3 scripts/preprocess_translate.py $dev_outfname_tgt.norm $dev_outfname_tgt $tgt_lang $tgt_transliterate true)
    echo "Number of sentences in dev: $input_size"
done


# this concatenates the lang pair data and creates text files to keep track of the
# number of lines in each lang pair. this is important for joint training, as we will
# merge all the lang pairs, and the individual per-pair line counts will be required
# for adding specific lang tags later.
# the output of these scripts will be a text file like this:
# <lang1>-<lang2> <number of lines>
# lang1-lang2 n1
# lang1-lang3 n2
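# e.g. (illustrative contents; the pair names and counts are made up):
#   eng_Latn-hin_Deva 1000000
#   eng_Latn-tam_Taml 500000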
python3 scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data 'train'
python3 scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data 'dev'


# tokenization of train and dev set using the spm trained models
mkdir -p $exp_dir/bpe

splits=(train dev)
for split in ${splits[@]}; do
	echo "Applying sentence piece for $split"
	bash apply_sentence_piece.sh $exp_dir $exp_dir/data $exp_dir/bpe SRC TGT $split $parallel_installed
done
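
# the loop above should leave merged, spm-segmented files like $exp_dir/bpe/train.SRC,
# $exp_dir/bpe/train.TGT, $exp_dir/bpe/dev.SRC and $exp_dir/bpe/dev.TGT (these are the
# paths referenced by the commented-out filtering step further below)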


# this is only required for joint training
# we apply language tags to the bpe segmented data
# if we are translating lang1 to lang2 then <lang1 line> will become <lang1> <lang2> <lang1 line>
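# e.g. (illustrative; the exact tag syntax is defined in add_joint_tags_translate.py):
#   an eng_Latn -> hin_Deva source line "hello world" becomes
#   "<eng_Latn> <hin_Deva> hello world"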
mkdir -p $exp_dir/final

echo "Adding language tags"
python3 scripts/add_joint_tags_translate.py $exp_dir 'train'
python3 scripts/add_joint_tags_translate.py $exp_dir 'dev'


# this is an important step if you are training on TPUs and using num_batch_buckets.
# the current implementation does not remove outliers before bucketing, hence
# removing these large sentences ourselves helps with getting better buckets
# python scripts/remove_large_sentences.py $exp_dir/bpe/train.SRC $exp_dir/bpe/train.TGT $exp_dir/final/train.SRC $exp_dir/final/train.TGT
# python scripts/remove_large_sentences.py $exp_dir/bpe/dev.SRC $exp_dir/bpe/dev.TGT $exp_dir/final/dev.SRC $exp_dir/final/dev.TGT
# python scripts/remove_large_sentences.py $exp_dir/bpe/test.SRC $exp_dir/bpe/test.TGT $exp_dir/final/test.SRC $exp_dir/final/test.TGT


echo "Binarizing data"

# use cpu_count to get num_workers instead of setting it manually when running
# on different instances
num_workers=$(python3 -c "import multiprocessing; print(multiprocessing.cpu_count())")
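# (on Linux, num_workers=$(nproc) would be an equivalent shortcut)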

data_dir=$exp_dir/final
out_data_dir=$exp_dir/final_bin

fairseq-preprocess \
    --source-lang SRC --target-lang TGT \
    --trainpref $data_dir/train \
    --validpref $data_dir/dev \
    --destdir $out_data_dir \
    --workers $num_workers \
    --srcdict $exp_dir/final_bin/dict.SRC.txt \
    --tgtdict $exp_dir/final_bin/dict.TGT.txt
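
# fairseq-preprocess writes the binarized shards (train.SRC-TGT.SRC.bin/.idx,
# valid.SRC-TGT.SRC.bin/.idx, and their TGT counterparts) along with copies of the
# dictionaries into $out_data_dir, ready to be passed to fairseq-train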