#!/usr/bin/env bash
#
# Runs the same input through ./tests/test-tokenizer-0.py and the
# ./build/bin/test-tokenizer-0 binary, then diffs <input>.tok against
# <input>.tokcpp (presumably written by the Python and C++ runs respectively).
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#
#   <name>   selects ./models/tokenizers/<name> and ./models/ggml-vocab-<name>.gguf
#   <input>  input file handed to both tokenizers
#
# Output of each run is captured in /tmp/test-tokenizer-0-<name>-{py,cpp}.log.
# All paths are relative to the current directory.
#

if [[ $# -ne 2 ]]; then
    # $0 is passed as an argument, not spliced into the format string, so a
    # '%' or '\' in the script path is printed literally.
    printf 'Usage: %s <name> <input>\n' "$0" >&2
    exit 1
fi

name=$1
input=$2

readonly test_bin=./build/bin/test-tokenizer-0
readonly py_log="/tmp/test-tokenizer-0-${name}-py.log"
readonly cpp_log="/tmp/test-tokenizer-0-${name}-cpp.log"

#######################################
# Run a command with stdout and stderr captured in a log file. On failure,
# show the tail of the log (the error would otherwise be invisible) and exit
# the script with the command's status.
# Arguments: $1 - log file path; remaining args - command to run
#######################################
run_logged() {
    local log=$1
    shift
    local rc=0
    "$@" > "$log" 2>&1 || rc=$?
    if (( rc != 0 )); then
        printf 'Command failed (exit %d): %s\n' "$rc" "$*" >&2
        printf 'Last lines of %s:\n' "$log" >&2
        tail -n 32 -- "$log" >&2 || true
        exit "$rc"
    fi
}

# Build using CMake if binary doesn't exist
if [[ ! -f "$test_bin" ]]; then
    printf "Building test-tokenizer-0 with CMake...\n"
    # Stop right away on a failed configure/build instead of failing later on
    # a missing binary.
    cmake -B build -DLLAMA_BUILD_TESTS=ON || exit
    cmake --build build --target test-tokenizer-0 -j || exit
fi

printf "Testing %s on %s ...\n" "$name" "$input"

# From here on any failing step aborts the script; this includes grep finding
# no "tokenized in" line in a log.
set -e

printf "Tokenizing using (py) Python AutoTokenizer ...\n"
run_logged "$py_log" \
    python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input"

printf "Tokenizing using (cpp) llama.cpp ...\n"
run_logged "$cpp_log" \
    "$test_bin" "./models/ggml-vocab-$name.gguf" "$input"

grep "tokenized in" "$py_log"
grep "tokenized in" "$cpp_log"

# A non-zero diff is an expected outcome below, not a fatal error.
set +e

if diff -- "$input.tok" "$input.tokcpp" > /dev/null 2>&1; then
    printf "Tokenization is correct!\n"
else
    # Show only the first 32 lines of the difference to keep output readable.
    diff -- "$input.tok" "$input.tokcpp" | head -n 32
    # NOTE(review): the exit status is still 0 on a mismatch (unchanged from
    # the previous behavior); callers have to inspect the printed result.
    printf "Tokenization differs!\n"
fi