From 0c0a0dcc888d580db07ff67881419b8713b05c1d Mon Sep 17 00:00:00 2001 From: hourhl Date: Sun, 11 Jan 2026 16:31:53 +0800 Subject: [PATCH 1/2] fix: OOB reads in UGM tokenizer (precompiled_charsmap handling) - Validate minimum size (4 bytes) before reading xcda_blob_size - Use strnlen with bounds check instead of unsafe strlen Both issues allow heap-buffer-overflow from malicious T5/UGM GGUF files. --- src/llama-vocab.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a20c6525e4..c9be238326 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -783,6 +783,9 @@ struct llm_tokenizer_ugm : llm_tokenizer { // First four bytes of precompiled_charsmap contains length of binary // blob containing XOR-compressed compact double array (XCDA) entries + if (precompiled_charsmap.size() < sizeof(uint32_t)) { + throw std::runtime_error("precompiled_charsmap too small for xcda_blob_size header!"); + } uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0]; charsmap_offset += sizeof(xcda_blob_size); if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) { @@ -1103,7 +1106,13 @@ private: throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); } const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset]; - return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length }; + // Use strnlen to safely bound the search within prefix_replacements + size_t max_len = tokenizer.prefix_replacements_size - longest_prefix_offset; + size_t repl_len = strnlen(prefix_replacement, max_len); + if (repl_len == max_len && prefix_replacement[max_len - 1] != '\0') { + throw std::runtime_error("Unterminated string in precompiled charsmap!"); + } + return { prefix_replacement, repl_len, longest_prefix_length }; } // check if the input prefix contains a valid sequence of UTF-8 code units From 8bcd53b74ec12b760adac186a98933c09ed29ad0 Mon Sep 17 00:00:00 2001 From: hourhl Date: Mon, 12 Jan 2026 15:53:05 +0800 Subject: [PATCH 2/2] Replace unsafe strnlen() with a bounds-checked loop that scans for \0 within the remaining array size. --- src/llama-vocab.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index c9be238326..ebd1700402 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1106,10 +1106,12 @@ private: throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); } const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset]; - // Use strnlen to safely bound the search within prefix_replacements size_t max_len = tokenizer.prefix_replacements_size - longest_prefix_offset; - size_t repl_len = strnlen(prefix_replacement, max_len); - if (repl_len == max_len && prefix_replacement[max_len - 1] != '\0') { + size_t repl_len = 0; + while (repl_len < max_len && prefix_replacement[repl_len] != '\0') { + repl_len++; + } + if (repl_len == max_len) { throw std::runtime_error("Unterminated string in precompiled charsmap!"); } return { prefix_replacement, repl_len, longest_prefix_length };