From 0c0a0dcc888d580db07ff67881419b8713b05c1d Mon Sep 17 00:00:00 2001
From: hourhl <hourhl8200@gmail.com>
Date: Sun, 11 Jan 2026 16:31:53 +0800
Subject: [PATCH] fix: OOB reads in UGM tokenizer (precompiled_charsmap
 handling)

- Validate minimum size (4 bytes) before reading xcda_blob_size
- Use strnlen bounded by the remaining prefix_replacements size instead
  of an unbounded strlen

Both issues allow a heap-buffer-overflow (out-of-bounds read) to be
triggered by a malicious T5/UGM GGUF file.
---
 src/llama-vocab.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a20c6525e4..c9be238326 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -783,6 +783,9 @@ struct llm_tokenizer_ugm : llm_tokenizer {
 
             // First four bytes of precompiled_charsmap contains length of binary
             // blob containing XOR-compressed compact double array (XCDA) entries
+            if (precompiled_charsmap.size() < sizeof(uint32_t)) {
+                throw std::runtime_error("precompiled_charsmap too small for xcda_blob_size header!");
+            }
             uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
             charsmap_offset += sizeof(xcda_blob_size);
             if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
@@ -1103,7 +1106,13 @@ private:
                 throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
             }
             const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
-            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
+            // Use strnlen to safely bound the search within prefix_replacements
+            size_t max_len = tokenizer.prefix_replacements_size - longest_prefix_offset;
+            size_t repl_len = strnlen(prefix_replacement, max_len);
+            if (repl_len == max_len && prefix_replacement[max_len - 1] != '\0') {
+                throw std::runtime_error("Unterminated string in precompiled charsmap!");
+            }
+            return { prefix_replacement, repl_len, longest_prefix_length };
         }
 
         // check if the input prefix contains a valid sequence of UTF-8 code units