fix: OOB reads in UGM tokenizer (precompiled_charsmap handling)
- Validate minimum size (4 bytes) before reading xcda_blob_size
- Use strnlen with bounds check instead of unsafe strlen

Both issues allow heap-buffer-overflow from malicious T5/UGM GGUF files.
parent 707cbafcaa
commit 0c0a0dcc88
@@ -783,6 +783,9 @@ struct llm_tokenizer_ugm : llm_tokenizer {
             // First four bytes of precompiled_charsmap contains length of binary
             // blob containing XOR-compressed compact double array (XCDA) entries
+            if (precompiled_charsmap.size() < sizeof(uint32_t)) {
+                throw std::runtime_error("precompiled_charsmap too small for xcda_blob_size header!");
+            }
             uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
             charsmap_offset += sizeof(xcda_blob_size);
             if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
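For reference, the first hunk guards the 4-byte xcda_blob_size header read: previously a blob shorter than 4 bytes would be dereferenced as a uint32_t. A minimal standalone sketch of the same pattern, assuming an untrusted std::vector<uint8_t> blob rather than the actual llama.cpp types (the helper name read_u32_header is hypothetical):

#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

// Hypothetical helper: read the leading uint32_t length field from an
// untrusted blob without reading past the end of the buffer.
static uint32_t read_u32_header(const std::vector<uint8_t> & blob) {
    if (blob.size() < sizeof(uint32_t)) {
        throw std::runtime_error("blob too small for uint32_t header");
    }
    uint32_t value = 0;
    std::memcpy(&value, blob.data(), sizeof(value)); // memcpy also avoids an unaligned load
    return value;
}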
@@ -1103,7 +1106,13 @@ private:
                 throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
             }
             const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
-            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
+            // Use strnlen to safely bound the search within prefix_replacements
+            size_t max_len = tokenizer.prefix_replacements_size - longest_prefix_offset;
+            size_t repl_len = strnlen(prefix_replacement, max_len);
+            if (repl_len == max_len && prefix_replacement[max_len - 1] != '\0') {
+                throw std::runtime_error("Unterminated string in precompiled charsmap!");
+            }
+            return { prefix_replacement, repl_len, longest_prefix_length };
         }
 
         // check if the input prefix contains a valid sequence of UTF-8 code units
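The second hunk bounds the string-length scan to the remaining bytes of the prefix_replacements buffer instead of trusting the data to be NUL-terminated. A minimal sketch of the same idea, assuming only a raw buffer and its size (bounded_strlen is a hypothetical name, not a llama.cpp function; strnlen is the same POSIX call the patch uses):

#include <cstring>
#include <stdexcept>

// Hypothetical helper: length of a NUL-terminated string stored at `offset`
// inside a packed buffer of size `buf_size`, never reading past the buffer.
static size_t bounded_strlen(const char * buf, size_t buf_size, size_t offset) {
    if (offset >= buf_size) {
        throw std::runtime_error("offset out of array bounds");
    }
    size_t max_len = buf_size - offset;
    size_t len = strnlen(buf + offset, max_len); // scans at most max_len bytes
    if (len == max_len) {
        // no terminator found inside the buffer
        throw std::runtime_error("unterminated string in buffer");
    }
    return len;
}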