vocab: add gemma4 tokenizer tests, fix edge case (#21534)

* YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests!
* Remove unnecessary hash  from update script.
* minor: move constant
This commit is contained in:
Piotr Wilkin (ilintar) 2026-04-09 11:41:14 +02:00 committed by GitHub
parent 243532e556
commit 0ec191e1d7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 169 additions and 2 deletions

Binary file not shown.

View File

@ -0,0 +1,111 @@
ied 4 ½ months
__ggml_vocab_test__
Äpfel
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
!!!!!!
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
Cửa Việt
__ggml_vocab_test__
discards
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL

View File

@ -0,0 +1,46 @@
1178 236743 236812 47041 3794
239122 22744 535
236743
138
139
255968
107
108
109
255968 107
9259 1902
26352 1902
9259 4109
26352 4109
26352 4109 236888
9259 236764 1902 236888
26352 236764 1902 236888
672 563 236743 478 397 404 391 236761 12362
236765 236771 236812 236828 236743 236832 11372 12065 31806 3405 9360
1337 12515 1333 4632 165543 3830
234889 63031 219876 66212 239077 237907 144494
242015 568 7382 236768 236743 247717 237243 248989 238178 568 43819 111730 150567 236768 113452 568 8960 64334 600 815 1061 1852 8369 236768
9259
26352
138 9259
139 9259
140 9259
140 9259 107 140 9259
568
107 578
236789 6933
9259 236764 570 236789 712 236888 2088 659 611 170124 2360 62133 237075 17641 11700 236770 236800 236770 236812 236770 236810 236770 237471 238352
123947
236800
236800 236800
236800 236800 236800
236800 236800 236800 236800
236800 236800 236800 236800 236800
236800 236800 236800 236800 236800 236800
236800 236800 236800 236800 236800 236800 236800
236800 236800 236800 236800 236800 236800 236800 236800
236800 236800 236800 236800 236800 236800 236800 236800 236800
236780 29719 33154
2243 2206
107 236743 108 236743 109 236743 255968 236743 255969 236743 255968 107 138 107 139 107 140 107 141 107 242015 568 7382 236768 236743 247717 237243 248989 238178 568 43819 111730 150567 236768 113452 236743 478 397 404 391 478 397 404 391 236743 236800 236743 236800 236800 236743 236800 236800 236800 236743 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236800 236800 236743 236800 236761 236800 236743 236800 856 236800 236743 236800 1390 236800 90986 92814 63031 219876 66212 241702 2360 62133 237075 17641 11700 236770 236800 236770 236812 236770 236810 236770 237471 238352 80448 120697 210119 1333 4632 165543 3830 9451 159561 2629 2629 2717 84491 19938 123947 38950 10371 564 236789 560 1010 756 151812 668 236789 236751 993 236764 756 1357 611 2889 236881 756 236792 711 2889 564 236789 859 1386 625 236764 756 236796 611 1133 1070 11115 236881 1191 236789 32541 496 236789 95635

View File

@ -659,8 +659,17 @@ struct llm_tokenizer_bpe_session {
if (token == LLAMA_TOKEN_NULL) {
for (auto j = str.begin(); j != str.end(); ++j) {
std::string byte_str(1, *j);
auto token_multibyte = vocab.text_to_token(byte_str);
llama_token token_multibyte = LLAMA_TOKEN_NULL;
if (tokenizer.byte_encode) {
std::string byte_str(1, *j);
token_multibyte = vocab.text_to_token(byte_str);
} else {
// For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format
static const char * hex = "0123456789ABCDEF";
const uint8_t ch = (uint8_t)*j;
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
token_multibyte = vocab.text_to_token(buf);
}
if (token_multibyte != LLAMA_TOKEN_NULL) {
output.push_back(token_multibyte);
}

View File

@ -124,6 +124,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gemma-4 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gemma-4.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)