#
# USAGE: preprocess.sh langid spmodel < input > output
#
# replace SPMENCODE with your own setup!
#
# CHANGES
#
#  * issue with the perl code that removes control characters:
#    the Unicode property Other (\p{C}) seems to remove
#    newline characters as well --> add a negative lookahead
#    to avoid removing newline characters!
#
# Path of the spm_encode binary: the one found on PATH if there is one,
# otherwise the copy expected under ${PWD}/tools/marian-dev/build.
# `command -v` is the POSIX way to look a command up; unlike `which` it is
# built into the shell and has a well-defined exit status and output.
SPMENCODE=$(command -v spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode")
## simple pre-processing steps adapted from Moses tools | |
sed -e 's/,/,/g' \ | |
-e 's/。 */. /g' \ | |
-e 's/、/,/g' \ | |
-e 's/”/"/g' \ | |
-e 's/“/"/g' \ | |
-e 's/∶/:/g' \ | |
-e 's/:/:/g' \ | |
-e 's/?/\?/g' \ | |
-e 's/《/"/g' \ | |
-e 's/》/"/g' \ | |
-e 's/)/\)/g' \ | |
-e 's/!/\!/g' \ | |
-e 's/(/\(/g' \ | |
-e 's/;/;/g' \ | |
-e 's/1/"/g' \ | |
-e 's/」/"/g' \ | |
-e 's/「/"/g' \ | |
-e 's/0/0/g' \ | |
-e 's/3/3/g' \ | |
-e 's/2/2/g' \ | |
-e 's/5/5/g' \ | |
-e 's/6/6/g' \ | |
-e 's/9/9/g' \ | |
-e 's/7/7/g' \ | |
-e 's/8/8/g' \ | |
-e 's/4/4/g' \ | |
-e 's/. */. /g' \ | |
-e 's/~/\~/g' \ | |
-e "s/’/\'/g" \ | |
-e 's/…/\.\.\./g' \ | |
-e 's/━/\-/g' \ | |
-e 's/〈/\</g' \ | |
-e 's/〉/\>/g' \ | |
-e 's/【/\[/g' \ | |
-e 's/】/\]/g' \ | |
-e 's/%/\%/g' | | |
perl -C -pe 's/(?!\n)\p{C}/ /g;' | | |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\ | |
sed 's/ */ /g;s/^ *//g;s/ *$//g' | |