File size: 921 Bytes
b664585 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
#!/bin/bash
#
# Usage:
#
# test-tokenizer-0.sh <name> <input>
#
if [ $# -ne 2 ]; then
printf "Usage: $0 <name> <input>\n"
exit 1
fi
name=$1
input=$2
make -j tests/test-tokenizer-0
printf "Testing %s on %s ...\n" $name $input
set -e
printf "Tokenizing using (py) Python AutoTokenizer ...\n"
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
printf "Tokenizing using (cpp) llama.cpp ...\n"
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
set +e
diff $input.tok $input.tokcpp > /dev/null 2>&1
if [ $? -eq 0 ]; then
printf "Tokenization is correct!\n"
else
diff $input.tok $input.tokcpp | head -n 32
printf "Tokenization differs!\n"
fi
|