File size: 795 Bytes
f9d7028 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
#!/bin/bash
# Normalizes punctuation and strips extra spaces in the input text.
# Directly sourced from https://github.com/pluiez/NLLB-inference
#
# Usage: <this script> lang
#   lang  language code; it is looked up in utils.map_token_lang.tsv, which is
#         expected to sit in the same directory as this script.

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# Directory containing this script. "$0" is quoted so that an install path
# containing whitespace is passed to dirname as a single argument.
root=$(dirname -- "$0")
lang_map_path=$root/utils.map_token_lang.tsv
# Print a one-line usage message to stderr and terminate with exit status 1.
# Takes no arguments; never returns to the caller.
usage() {
  printf 'usage: %s lang\n' "$0" >&2
  exit 1
}
# Require exactly one positional argument: the language code to look up.
if [[ $# -ne 1 ]]; then
  usage
fi
lang=$1

# Load the tab-separated mapping file into an associative array
# (field 1 -> key, field 2 -> value).
declare -A lang_map=()
# -r keeps backslashes literal. The `|| [[ -n "$line" ]]` clause makes sure a
# final line without a trailing newline is still processed.
while read -r line || [[ -n "$line" ]]; do
  # Skip blank lines: an empty string is not a valid associative-array
  # subscript, so assigning with it would be an error.
  [[ -n "$line" ]] || continue
  # Split on tabs with parameter expansion instead of forking `cut` twice per
  # line. As with `cut -f`, a line that contains no tab yields the whole line
  # for both fields, and only the second field is kept when more are present.
  key=${line%%$'\t'*}
  rest=${line#*$'\t'}
  val=${rest%%$'\t'*}
  lang_map[$key]=$val
done < "$lang_map_path"

# Resolve the requested code: exact match first, then its first three
# characters, otherwise fall back to "en" with a warning on stderr.
# ${arr[key]+set} expands to "set" only when the key exists (even if its value
# is empty). The -n guards keep an empty key from ever being used as a
# subscript.
prefix=${lang:0:3}
if [[ -n "$lang" && -n "${lang_map[$lang]+set}" ]]; then
  lang=${lang_map[$lang]}
elif [[ -n "$prefix" && -n "${lang_map[$prefix]+set}" ]]; then
  lang=${lang_map[$prefix]}
else
  echo "undefined mapping: ${lang}, falling back to: en" >&2
  lang=en
fi

# Run the normalizer with the resolved language as its only argument. No
# redirection is applied, so perl inherits this script's stdin and stdout.
perl "$root/normalize-punctuation.perl" "$lang"
|