Merge 2162bec1fc into cc2aa81513
This commit is contained in:
commit
6cdedc79eb
|
|
@ -688,8 +688,51 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|||
// Filesystem utils
|
||||
//
|
||||
|
||||
// Validate if a filename is safe to use
|
||||
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
||||
// Normalizes a relative filepath
|
||||
// - Replaces backslashes and forward slashes with the system path separator
|
||||
// - Trims leading './' or '.\' segments
|
||||
// - Trims leading '/' or '\' (treat 'root' as relative)
|
||||
// - Trims duplicate directory separators
|
||||
// - Does not resolve '..' segments
|
||||
// - Does not ensure the path is valid or safe
|
||||
// Use in conjunction with `fs_validate_filename`, calling `fs_validate_filename` after `fs_normalize_filepath`
|
||||
std::string fs_normalize_filepath(const std::string & path) {
|
||||
std::string result;
|
||||
result.reserve(path.size());
|
||||
|
||||
bool leading = true;
|
||||
char prev = 0;
|
||||
for (size_t i = 0; i < path.size(); ++i) {
|
||||
char c = path[i];
|
||||
if (c == '/' || c == '\\') {
|
||||
c = DIRECTORY_SEPARATOR;
|
||||
}
|
||||
if (leading) {
|
||||
if (c == DIRECTORY_SEPARATOR) {
|
||||
continue; // Skip leading separators
|
||||
} else if (c == '.') {
|
||||
if (i + 1 < path.size()) {
|
||||
char next = path[i + 1];
|
||||
if (next == '/' || next == '\\') {
|
||||
++i;
|
||||
continue; // Skip leading dot segments
|
||||
}
|
||||
}
|
||||
}
|
||||
leading = false;
|
||||
}
|
||||
if (prev == DIRECTORY_SEPARATOR && c == DIRECTORY_SEPARATOR) {
|
||||
continue; // Skip duplicate separators
|
||||
}
|
||||
prev = c;
|
||||
result += c;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Validate if a filename or path is safe to use
|
||||
// Strictly rejects path traversal attempts, absolute paths, and reserved/illegal characters
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
||||
if (!filename.length()) {
|
||||
// Empty filename invalid
|
||||
|
|
@ -698,10 +741,13 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
|||
if (filename.length() > 255) {
|
||||
// Limit at common largest possible filename on Linux filesystems
|
||||
// to avoid unnecessary further validation
|
||||
// NOTE: The 255 limit is per filename element on Linux, not the whole path
|
||||
// On Windows, the limit is commonly 260 for the whole absolute path
|
||||
// (On systems with smaller limits it will be caught by the OS)
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t prev = 0;
|
||||
size_t offset = 0;
|
||||
while (offset < filename.size()) {
|
||||
utf8_parse_result result = parse_utf8_codepoint(filename, offset);
|
||||
|
|
@ -711,6 +757,7 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
|||
}
|
||||
uint32_t c = result.codepoint;
|
||||
|
||||
// Check for overlong UTF-8 sequences
|
||||
if ((result.bytes_consumed == 2 && c < 0x80) ||
|
||||
(result.bytes_consumed == 3 && c < 0x800) ||
|
||||
(result.bytes_consumed == 4 && c < 0x10000)) {
|
||||
|
|
@ -719,7 +766,7 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
|||
|
||||
// Check for forbidden codepoints:
|
||||
// - Control characters
|
||||
// - Unicode equivalents of illegal characters
|
||||
// - Unicode equivalents of path traversal characters
|
||||
// - UTF-16 surrogate pairs
|
||||
// - UTF-8 replacement character
|
||||
// - Byte order mark (BOM)
|
||||
|
|
@ -728,8 +775,17 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
|||
|| c == 0x7F // Control characters (DEL)
|
||||
|| (c >= 0x80 && c <= 0x9F) // Control characters (C1)
|
||||
|| c == 0xFF0E // Fullwidth Full Stop (period equivalent)
|
||||
|| c == 0x2215 // Division Slash (forward slash equivalent)
|
||||
|| c == 0x2216 // Set Minus (backslash equivalent)
|
||||
|| c == 0xFF0F // Fullwidth Solidus (forward slash equivalent, CP 874, 1250-1258)
|
||||
|| c == 0xFF3C // Fullwidth Reverse Solidus (backslash equivalent, CP 874, 1250-1258)
|
||||
|| c == 0xFF1A // Fullwidth Colon (colon equivalent, CP 874, 1250-1258)
|
||||
|| c == 0x2215 // Division Slash (forward slash equivalent, CP 1250, 1252, 1254)
|
||||
|| c == 0x2216 // Set Minus (backslash equivalent, CP 1250, 1252, 1254)
|
||||
|| c == 0x2044 // Fraction Slash (forward slash equivalent, CP 1250, 1252, 1254)
|
||||
|| c == 0x2236 // Ratio (colon equivalent, CP 1250, 1252, 1254)
|
||||
|| c == 0x0589 // Armenian Full Stop (colon equivalent, CP 1250, 1252, 1254)
|
||||
|| c == 0x00A5 // Yen Sign (backslash equivalent, CP 932 Japanese)
|
||||
|| c == 0x20A9 // Won Sign (backslash equivalent, CP 949, 1361 Korean)
|
||||
|| c == 0x00B4 // Acute Accent (forward slash equivalent, CP 1253 Greek)
|
||||
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
||||
|| c > 0x10FFFF // Max Unicode limit
|
||||
|| c == 0xFFFD // Replacement Character (UTF-8)
|
||||
|
|
@ -738,26 +794,33 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
|||
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
||||
return false;
|
||||
}
|
||||
if (!allow_subdirs && (c == '/' || c == '\\')) {
|
||||
if (allow_subdirs) {
|
||||
if ((prev == '.' || prev == ' ') && (c == '/' || c == '\\')) {
|
||||
// Reject any trailing dot or whitespace, these are stripped on Windows
|
||||
// This also matches path elements that equal '..' or '.'
|
||||
return false;
|
||||
}
|
||||
if ((prev == '/' || prev == '\\') && (c == ' ')) {
|
||||
// Reject any leading whitespace, these are stripped on Windows
|
||||
return false;
|
||||
}
|
||||
} else if (c == '/' || c == '\\') {
|
||||
// Subdirectories not allowed, reject path separators
|
||||
return false;
|
||||
}
|
||||
prev = c;
|
||||
offset += result.bytes_consumed;
|
||||
}
|
||||
|
||||
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
||||
// Unicode and other whitespace is not affected, only 0x20 space
|
||||
// This also matches paths that equal '..' or '.'
|
||||
if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
|
||||
if (filename.find("..") != std::string::npos) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reject "."
|
||||
if (filename == ".") {
|
||||
// Reject any leading path separators
|
||||
if (filename.front() == '/' || filename.front() == '\\') {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -707,6 +707,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
|||
// Filesystem utils
|
||||
//
|
||||
|
||||
std::string fs_normalize_filepath(const std::string & path);
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
|
||||
bool fs_create_directory_with_parents(const std::string & path);
|
||||
bool fs_is_directory(const std::string & path);
|
||||
|
|
|
|||
|
|
@ -227,6 +227,7 @@ llama_build_and_test(test-thread-safety.cpp ARGS -m "${MODEL_DEST}" -ngl 99 -p "
|
|||
set_tests_properties(test-thread-safety PROPERTIES FIXTURES_REQUIRED test-download-model)
|
||||
|
||||
llama_build_and_test(test-arg-parser.cpp)
|
||||
llama_build_and_test(test-fs-validate-filename.cpp)
|
||||
|
||||
if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
|
||||
# TODO: repair known memory leaks
|
||||
|
|
|
|||
|
|
@ -0,0 +1,213 @@
|
|||
#include "common.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
#undef NDEBUG
|
||||
#include <cassert>
|
||||
|
||||
static int n_tests = 0;
|
||||
static int n_failed = 0;
|
||||
|
||||
static const char SEP = DIRECTORY_SEPARATOR;
|
||||
|
||||
static void test_normalize(const char * desc, const std::string & expected, const std::string & input) {
|
||||
std::string result = fs_normalize_filepath(input);
|
||||
n_tests++;
|
||||
if (result != expected) {
|
||||
n_failed++;
|
||||
printf(" FAIL: %s (got \"%s\", expected \"%s\")\n", desc, result.c_str(), expected.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
static void test(const char * desc, bool expected, const std::string & filename, bool allow_subdirs = false) {
|
||||
bool result = fs_validate_filename(filename, allow_subdirs);
|
||||
n_tests++;
|
||||
if (result != expected) {
|
||||
n_failed++;
|
||||
printf(" FAIL: %s (got %s, expected %s)\n", desc,
|
||||
result ? "true" : "false", expected ? "true" : "false");
|
||||
}
|
||||
}
|
||||
|
||||
static void test_combined(const char * desc, bool expected, const std::string & path) {
|
||||
test(desc, expected, fs_normalize_filepath(path), true);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
// --- Basic valid filenames ---
|
||||
test("simple ascii", true, "hello.txt");
|
||||
test("no extension", true, "readme");
|
||||
test("multiple dots", true, "archive.tar.gz");
|
||||
test("leading dot (hidden)", true, ".gitignore");
|
||||
test("unicode filename", true, "\xc3\xa9\xc3\xa0\xc3\xbc.txt"); // éàü.txt
|
||||
test("precomposed accent", true, "caf\xc3\xa9"); // café (U+00E9)
|
||||
test("combining accent", true, "cafe\xcc\x81"); // café (e + U+0301)
|
||||
test("japanese hiragana", true, "\xe3\x81\x82\xe3\x81\x84\xe3\x81\x86.txt"); // あいう.txt
|
||||
test("korean hangul", true, "\xed\x95\x9c\xea\xb8\x80.txt"); // 한글.txt
|
||||
test("max length (255 bytes)", true, std::string(255, 'a'));
|
||||
|
||||
// --- Basic invalid filenames ---
|
||||
test("empty string", false, "");
|
||||
test("over 255 bytes", false, std::string(256, 'a'));
|
||||
test("just a dot", false, ".");
|
||||
test("double dot", false, "..");
|
||||
test("leading space", false, " foo");
|
||||
test("trailing space", false, "foo ");
|
||||
test("trailing dot", false, "foo.");
|
||||
test("dot path", false, "./././");
|
||||
|
||||
// --- Double dots ---
|
||||
test("contains double dot", true, "foo..bar");
|
||||
test("leading double dot", true, "..foo");
|
||||
test("trailing double dot", false, "foo.."); // trailing dot
|
||||
|
||||
// --- Control characters ---
|
||||
test("null byte", false, std::string("foo\x00""bar", 7));
|
||||
test("newline", false, "foo\nbar");
|
||||
test("tab", false, "foo\tbar");
|
||||
test("C0 control (0x01)", false, "foo\x01""bar");
|
||||
test("DEL (0x7F)", false, "foo\x7f""bar");
|
||||
test("C1 control (0x80)", false, "foo\xc2\x80""bar"); // U+0080
|
||||
test("C1 control (0x9F)", false, "foo\xc2\x9f""bar"); // U+009F
|
||||
|
||||
// --- Illegal characters ---
|
||||
test("colon", false, "foo:bar");
|
||||
test("asterisk", false, "foo*bar");
|
||||
test("question mark", false, "foo?bar");
|
||||
test("double quote", false, "foo\"bar");
|
||||
test("less than", false, "foo<bar");
|
||||
test("greater than", false, "foo>bar");
|
||||
test("pipe", false, "foo|bar");
|
||||
test("forward slash", false, "foo/bar");
|
||||
test("backslash", false, "foo\\bar");
|
||||
|
||||
// --- Unicode special codepoints ---
|
||||
test("fullwidth period U+FF0E", false, "foo\xef\xbc\x8e""bar");
|
||||
test("replacement char U+FFFD", false, "foo\xef\xbf\xbd""bar");
|
||||
test("BOM U+FEFF", false, "foo\xef\xbb\xbf""bar");
|
||||
|
||||
// --- Windows bestfit characters (map to path traversal chars under WideCharToMultiByte) ---
|
||||
test("fullwidth solidus U+FF0F", false, "foo\xef\xbc\x8f""bar"); // / on CP 874, 1250-1258
|
||||
test("fullwidth rev solidus U+FF3C",false, "foo\xef\xbc\xbc""bar"); // \ on CP 874, 1250-1258
|
||||
test("fullwidth colon U+FF1A", false, "foo\xef\xbc\x9a""bar"); // : on CP 874, 1250-1258
|
||||
test("division slash U+2215", false, "foo\xe2\x88\x95""bar"); // / on CP 1250, 1252, 1254
|
||||
test("set minus U+2216", false, "foo\xe2\x88\x96""bar"); // \ on CP 1250, 1252, 1254
|
||||
test("fraction slash U+2044", false, "foo\xe2\x81\x84""bar"); // / on CP 1250, 1252, 1254
|
||||
test("ratio U+2236", false, "foo\xe2\x88\xb6""bar"); // : on CP 1250, 1252, 1254
|
||||
test("armenian full stop U+0589", false, "foo\xd6\x89""bar"); // : on CP 1250, 1252, 1254
|
||||
test("yen sign U+00A5", false, "foo\xc2\xa5""bar"); // \ on CP 932 (Japanese)
|
||||
test("won sign U+20A9", false, "foo\xe2\x82\xa9""bar"); // \ on CP 949 (Korean)
|
||||
test("acute accent U+00B4", false, "foo\xc2\xb4""bar"); // / on CP 1253 (Greek)
|
||||
|
||||
// --- Invalid UTF-8 ---
|
||||
test("invalid continuation", false, "foo\x80""bar");
|
||||
test("truncated sequence", false, "foo\xc3");
|
||||
test("overlong slash (2-byte)", false, "foo\xc0\xaf""bar"); // U+002F as 2-byte
|
||||
test("overlong dot (2-byte)", false, "foo\xc0\xae""bar"); // U+002E as 2-byte
|
||||
test("overlong 'a' (2-byte)", false, "foo\xc1\xa1""bar"); // U+0061 as 2-byte
|
||||
test("overlong 'A' (2-byte)", false, "foo\xc1\x81""bar"); // U+0041 as 2-byte
|
||||
test("overlong null (2-byte)", false, "foo\xc0\x80""bar"); // U+0000 as 2-byte
|
||||
|
||||
// --- Paths without allow_subdirs ---
|
||||
test("forward slash blocked", false, "foo/bar");
|
||||
test("backslash blocked", false, "foo\\bar");
|
||||
|
||||
// --- Paths with allow_subdirs=true ---
|
||||
test("simple subdir", true, "foo/bar", true);
|
||||
test("backslash subdir", true, "foo\\bar", true);
|
||||
test("deep path", true, "a/b/c/d.txt", true);
|
||||
test("trailing slash", true, "foo/bar/", true);
|
||||
test("colon in path", false, "foo/b:r/baz", true);
|
||||
test("control char in path", false, "foo/b\nar/baz", true);
|
||||
test("dot path", false, "./././", true);
|
||||
|
||||
// --- Leading separators ---
|
||||
test("leading slash", false, "/foo/bar", true);
|
||||
test("leading backslash", false, "\\foo\\bar", true);
|
||||
|
||||
// --- Dotdot in paths ---
|
||||
test("leading dotdot in path", false, "../bar", true);
|
||||
test("dotdot in path", false, "foo/../bar", true);
|
||||
test("dotdot component leading", true, "foo/..bar/baz", true);
|
||||
test("dotdot component middle", true, "foo/ba..r/baz", true);
|
||||
test("dotdot component trailing", false, "foo/bar../baz", true); // trailing dot
|
||||
|
||||
// --- Per-component checks ---
|
||||
test("leading space in component", false, "foo/ bar/baz", true);
|
||||
test("trailing space in component", false, "foo/bar /baz", true);
|
||||
test("trailing dot in component", false, "foo/bar./baz", true);
|
||||
test("dot component in path", false, "foo/./bar", true);
|
||||
test("leading space after slash", false, "foo/ bar", true);
|
||||
test("trailing space before slash", false, "bar /baz", true);
|
||||
test("trailing dot before slash", false, "bar./baz", true);
|
||||
|
||||
// --- Simple filename tests ---
|
||||
test("simple binary file", true, "file.bin");
|
||||
test("japanese filename", true, u8"日本式ファイルの芸術.bin");
|
||||
|
||||
// --- Path traversal (no subdirs) ---
|
||||
test("dotdot slash", false, "../bad.bin");
|
||||
test("dotdot backslash", false, "..\\bad.bin");
|
||||
test("subdir dotdot backslash", false, "also/..\\bad.bin");
|
||||
test("subdir slash", false, "also/bad.bin");
|
||||
|
||||
// --- Unicode path equivalents ---
|
||||
test("division slash U+2215", false, "unicode\xe2\x88\x95""bad.bin");
|
||||
test("set minus U+2216", false, "unicode\xe2\x88\x96""bad.bin");
|
||||
test("fullwidth period U+FF0E", false, "unicode\xef\xbc\x8e""bad.bin");
|
||||
|
||||
// --- Overlong encoding ---
|
||||
test("overlong 0xC0 0x2E", false, "overlong\xc0\x2e""bad.bin");
|
||||
test("overlong 0xE0 0x40 0xAE", false, "overlong\xe0\x40\xae""bad.bin");
|
||||
test("overlong dot (2-byte)", false, "overlong\xc0\xae""bad.bin");
|
||||
test("overlong slash (2-byte)", false, "overlong\xc0\xaf""bad.bin");
|
||||
test("overlong slash (3-byte)", false, "overlong\xe0\x80\xaf""bad.bin");
|
||||
test("overlong 0xC0 0x2F", false, "overlong\xc0\x2f""bad.bin");
|
||||
test("overlong 0xC0 0x5C", false, "overlong\xc0\x5c""bad.bin");
|
||||
test("overlong 0xC0 0x80 0x5C", false, "overlong\xc0\x80\x5c""bad.bin");
|
||||
|
||||
// --- fs_normalize_filepath ---
|
||||
test_normalize("passthrough simple", "foo.txt", "foo.txt");
|
||||
test_normalize("passthrough subdir", std::string("foo") + SEP + "bar.txt", "foo/bar.txt");
|
||||
test_normalize("backslash to sep", std::string("foo") + SEP + "bar.txt", "foo\\bar.txt");
|
||||
test_normalize("mixed separators", std::string("a") + SEP + "b" + SEP + "c", "a/b\\c");
|
||||
test_normalize("duplicate slashes", std::string("foo") + SEP + "bar", "foo//bar");
|
||||
test_normalize("duplicate backslashes", std::string("foo") + SEP + "bar", "foo\\\\bar");
|
||||
test_normalize("triple slashes", std::string("foo") + SEP + "bar", "foo///bar");
|
||||
test_normalize("leading slash stripped", "foo", "/foo");
|
||||
test_normalize("leading backslash stripped", "foo", "\\foo");
|
||||
test_normalize("multiple leading slashes", "foo", "///foo");
|
||||
test_normalize("leading dot-slash stripped", "foo", "./foo");
|
||||
test_normalize("leading dot-backslash stripped", "foo", ".\\foo");
|
||||
test_normalize("deep path normalized",
|
||||
std::string("a") + SEP + "b" + SEP + "c" + SEP + "d.txt",
|
||||
"/a//b\\c/d.txt");
|
||||
|
||||
// --- normalize doesn't validate and doesn't trim dot segments in the middle of the path ---
|
||||
test_normalize("dotdot retained", std::string("foo") + SEP + ".." + SEP + "bar", "foo/../bar");
|
||||
test_normalize("dotdot at start retained", std::string("..") + SEP + "bar", "../bar");
|
||||
test_normalize("dotdot at end retained", std::string("foo") + SEP + "..", "foo/..");
|
||||
test_normalize("dot component retained mid", std::string("foo") + SEP + "." + SEP + "bar", "foo/./bar");
|
||||
|
||||
// --- combined tests validating normalized paths ---
|
||||
test_combined("absolute windows path", false, "C:\\Tools\\secrets.txt"); // absolute path
|
||||
test_combined("root is relative", true, "/meow/image.jpg"); // root separators are normalized to relative
|
||||
test_combined("relative dot path", true, "././meow/image.jpg"); // ok because no effect
|
||||
test_combined("inner dot path", false, "././meow/./image.jpg"); // blocked because plausibly a downstream traversal attempt
|
||||
test_combined("double dot path", false, "../meow/image.jpg"); // direct traversal attempt
|
||||
test_combined("mid double dot path", false, "meow/../image.jpg"); // technically a subpath but plausibly a downstream traversal attempt
|
||||
test_combined("end double dot path", false, "meow/.."); // blank path
|
||||
test_combined("triple single dot root", false, "./././"); // blank path
|
||||
test_combined("triple single dot file", true, "./././image.jpg"); // weird but okay
|
||||
|
||||
if (n_failed) {
|
||||
printf("\n%d/%d tests failed\n", n_failed, n_tests);
|
||||
fflush(stdout);
|
||||
assert(false);
|
||||
}
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -798,6 +798,7 @@ static void handle_media(
|
|||
}
|
||||
// load local image file
|
||||
std::string file_path = url.substr(7); // remove "file://"
|
||||
file_path = fs_normalize_filepath(file_path); // remove any leading './' and normalize separators
|
||||
raw_buffer data;
|
||||
if (!fs_validate_filename(file_path, true)) {
|
||||
throw std::invalid_argument("file path is not allowed: " + file_path);
|
||||
|
|
|
|||
Loading…
Reference in New Issue