#!/bin/bash
#
# Apply SentencePiece tokenization to the source and target sides of one
# corpus split.
#
# Usage:
#   <this script> exp_dir data_dir bpe_dir src_lang tgt_lang split [parallel_installed]
#
# Arguments:
#   $1 exp_dir             directory containing vocab/model.SRC and vocab/model.TGT
#   $2 data_dir            directory holding the input files  <split>.<lang>
#   $3 bpe_dir             directory receiving the output files <split>.<lang>
#   $4 src_lang            file suffix of the source side
#   $5 tgt_lang            file suffix of the target side
#   $6 split               file stem shared by input and output files
#   $7 parallel_installed  "true" to pipe the data through `parallel`;
#                          any other value (default: false) runs spm_encode directly
#
# Exit status: that of the last (target-side) encoding command.

# Timestamp for the log.
date

exp_dir=$1
data_dir=$2
bpe_dir=$3
src_lang=$4
tgt_lang=$5
split=$6
parallel_installed=${7:-false}

# Path prefixes; the language suffix is appended per side below.
in_split_dir=$data_dir/$split
out_split_dir=$bpe_dir/$split

#######################################
# Encode one text file into SentencePiece pieces.
# Globals:   parallel_installed (read)
# Arguments: $1 - SentencePiece model file
#            $2 - input text file
#            $3 - output file (overwritten)
# Returns:   exit status of the encoding command
#######################################
encode_corpus() {
  local model=$1
  local input=$2
  local output=$3

  # Compare as a string instead of executing the caller-supplied value as a
  # command; only the literal "true" selects the parallel path.
  if [[ "$parallel_installed" == "true" ]]; then
    # --keep-order keeps output lines aligned with input lines.
    # NOTE: GNU parallel re-parses its command through a shell, so a model
    # path containing whitespace is still unsafe on this branch.
    parallel --pipe --keep-order \
      spm_encode "--model=$model" \
      --output_format=piece \
      < "$input" \
      > "$output"
  else
    spm_encode "--model=$model" \
      --output_format=piece \
      < "$input" \
      > "$output"
  fi
}

echo "Apply Sentence Piece tokenization to SRC corpus"
encode_corpus "$exp_dir/vocab/model.SRC" \
  "$in_split_dir.$src_lang" \
  "$out_split_dir.$src_lang"

echo "Apply Sentence Piece tokenization to TGT corpus"
encode_corpus "$exp_dir/vocab/model.TGT" \
  "$in_split_dir.$tgt_lang" \
  "$out_split_dir.$tgt_lang"