From 243532e5568704ced8b08136e70247f9ceecdbf7 Mon Sep 17 00:00:00 2001 From: Kwa Jie Hao <31984694+kwajiehao@users.noreply.github.com> Date: Thu, 9 Apr 2026 17:28:33 +0800 Subject: [PATCH] jinja : support ensure_ascii=true, string repetition and int/float self-filtering (#21623) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: jinja engine improvements for reka-edge Port three Jinja engine improvements needed for the reka-edge model: 1. Python-style string repetition ("ab" * 3 → "ababab") 2. ensure_ascii=true support for tojson filter (escapes non-ASCII to \uXXXX) 3. int() builtin on value_int_t (identity, needed for Reka Edge template) * fix: escape invalid utf8 bytes when ensure_ascii=true The json_ensure_ascii_preserving_format function does not correctly handle the edge case where UTF-8 parsing fails: it copies the invalid byte to the output as a raw, unescaped byte. This commit fixes that by adding the Unicode standard replacement character \\ufffd to the output instead. This matches the standard behavior of programming languages such as Python, Rust, and Go. * chore: address PR comments 1. Add todo comment for supporting string repetition for arrays/tuples 2. Add support for float identity operation 3. 
Move invalid ascii test case to test_fuzzing * chore: accept suggestion for common/jinja/value.cpp Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- common/jinja/runtime.cpp | 17 ++++++++ common/jinja/value.cpp | 93 ++++++++++++++++++++++++++++++++++++++-- tests/test-jinja.cpp | 53 +++++++++++++++++++++++ 3 files changed, 160 insertions(+), 3 deletions(-) diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index 5b51427aa0..f81d98d954 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -251,6 +251,23 @@ value binary_expression::execute_impl(context & ctx) { return res; } + // Python-style string repetition + // TODO: support array/tuple repetition (e.g., [1, 2] * 3 → [1, 2, 1, 2, 1, 2]) + if (op.value == "*" && + ((is_val(left_val) && is_val(right_val)) || + (is_val(left_val) && is_val(right_val)))) { + const auto & str = is_val(left_val) ? left_val->as_string() : right_val->as_string(); + const int64_t repeat = is_val(right_val) ? 
right_val->as_int() : left_val->as_int(); + auto res = mk_val(); + if (repeat <= 0) { + return res; + } + for (int64_t i = 0; i < repeat; ++i) { + res->val_str = res->val_str.append(str); + } + return res; + } + // String membership if (is_val(left_val) && is_val(right_val)) { // case: "a" in "abc" diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index 7dc1d65407..8e86a715f5 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -1,4 +1,5 @@ #include "runtime.h" +#include "unicode.h" #include "value.h" // for converting from JSON to jinja values @@ -154,6 +155,83 @@ static value test_compare_fn(const func_args & args) { return mk_val(value_compare(args.get_pos(0), args.get_pos(1), op)); } +static void append_codepoint_as_ascii_json_escape(std::string & out, uint32_t codepoint) { + auto append_u16 = [&out](uint32_t value) { + char buf[8]; + snprintf(buf, sizeof(buf), "\\u%04x", static_cast(value)); + out += buf; + }; + + if (codepoint <= 0xFFFF) { + append_u16(codepoint); + return; + } + + codepoint -= 0x10000; + append_u16(0xD800 + ((codepoint >> 10) & 0x3FF)); + append_u16(0xDC00 + (codepoint & 0x3FF)); +} + +static std::string json_ensure_ascii_preserving_format(const std::string & json_str) { + std::string output; + output.reserve(json_str.size()); + + bool in_string = false; + bool escaped = false; + + for (size_t pos = 0; pos < json_str.size();) { + const char ch = json_str[pos]; + if (!in_string) { + output.push_back(ch); + if (ch == '"') { + in_string = true; + } + ++pos; + continue; + } + + if (escaped) { + output.push_back(ch); + escaped = false; + ++pos; + continue; + } + + if (ch == '\\') { + output.push_back(ch); + escaped = true; + ++pos; + continue; + } + + if (ch == '"') { + output.push_back(ch); + in_string = false; + ++pos; + continue; + } + + const unsigned char uch = static_cast(ch); + if (uch < 0x80) { + output.push_back(ch); + ++pos; + continue; + } + + auto parsed = common_parse_utf8_codepoint(json_str, pos); + if 
(parsed.status != utf8_parse_result::SUCCESS) { + output += "\\ufffd"; + ++pos; + continue; + } + + append_codepoint_as_ascii_json_escape(output, parsed.codepoint); + pos += parsed.bytes_consumed; + } + + return output; +} + static value tojson(const func_args & args) { args.ensure_count(1, 5); value val_ascii = args.get_kwarg_or_pos("ensure_ascii", 1); @@ -169,16 +247,17 @@ static value tojson(const func_args & args) { if (is_val(val_indent)) { indent = static_cast(val_indent->as_int()); } - if (val_ascii->as_bool()) { // undefined == false - throw not_implemented_exception("tojson ensure_ascii=true not implemented"); - } if (val_sort->as_bool()) { // undefined == false throw not_implemented_exception("tojson sort_keys=true not implemented"); } + const bool ensure_ascii = val_ascii->as_bool(); // undefined == false auto separators = (is_val(val_separators) ? val_separators : mk_val())->as_array(); std::string item_sep = separators.size() > 0 ? separators[0]->as_string().str() : (indent < 0 ? ", " : ","); std::string key_sep = separators.size() > 1 ? separators[1]->as_string().str() : ": "; std::string json_str = value_to_json(args.get_pos(0), indent, item_sep, key_sep); + if (ensure_ascii) { + json_str = json_ensure_ascii_preserving_format(json_str); + } return mk_val(json_str); } @@ -460,6 +539,10 @@ const func_builtins & value_int_t::get_builtins() const { int64_t val = args.get_pos(0)->as_int(); return mk_val(val < 0 ? 
-val : val); }}, + {"int", [](const func_args & args) -> value { + args.ensure_vals(); + return mk_val(args.get_pos(0)->as_int()); + }}, {"float", [](const func_args & args) -> value { args.ensure_vals(); double val = static_cast(args.get_pos(0)->as_int()); @@ -486,6 +569,10 @@ const func_builtins & value_float_t::get_builtins() const { int64_t val = static_cast(args.get_pos(0)->as_float()); return mk_val(val); }}, + {"float", [](const func_args & args) -> value { + args.ensure_vals(); + return mk_val(args.get_pos(0)->as_float()); + }}, {"safe", tojson}, {"string", tojson}, {"tojson", tojson}, diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index ce3008f4c7..b5ee53461e 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -447,6 +447,18 @@ static void test_expressions(testing & t) { "hello world" ); + test_template(t, "string repetition", + "{{ 'ab' * 3 }}", + json::object(), + "ababab" + ); + + test_template(t, "reversed string repetition", + "{{ 3 * 'ab' }}", + json::object(), + "ababab" + ); + test_template(t, "ternary", "{{ 'yes' if cond else 'no' }}", {{"cond", true}}, @@ -693,6 +705,33 @@ static void test_filters(testing & t) { "\"\\u2713\"" ); + test_template(t, "tojson ensure_ascii=true nested object", + "{{ data|tojson(ensure_ascii=true) }}", + {{"data", { + {"text", "\u2713"}, + {"items", json::array({"é", {{"snowman", "☃"}}})} + }}}, + "{\"text\": \"\\u2713\", \"items\": [\"\\u00e9\", {\"snowman\": \"\\u2603\"}]}" + ); + + test_template(t, "tojson ensure_ascii=true indent=2", + "{{ data|tojson(ensure_ascii=true, indent=2) }}", + {{"data", { + {"text", "\u2713"}, + {"nested", {{"accent", "é"}}} + }}}, + "{\n \"text\": \"\\u2713\",\n \"nested\": {\n \"accent\": \"\\u00e9\"\n }\n}" + ); + + test_template(t, "tojson ensure_ascii=true preserves existing escapes", + "{{ data|tojson(ensure_ascii=true) }}", + {{"data", { + {"emoji", "😀"}, + {"line", "a\nb"} + }}}, + "{\"emoji\": \"\\ud83d\\ude00\", \"line\": \"a\\nb\"}" + ); + test_template(t, 
"tojson sort_keys=true", "{{ data|tojson(sort_keys=true) }}", {{"data", {{"b", 2}, {"a", 1}}}}, @@ -771,6 +810,12 @@ static void test_filters(testing & t) { "hello" ); + test_template(t, "int filter on integer is identity", + "{{ value|int }}", + {{"value", 7}}, + "7" + ); + test_template(t, "none to string", "{{ x|string }}", {{"x", nullptr}}, @@ -2458,4 +2503,12 @@ static void test_fuzzing(testing & t) { t.assert_true("builtin " + type_name + "." + fn_name + " #" + std::to_string(i), fuzz_test_template(tmpl, vars)); } }); + + t.test("tojson ensure_ascii=true with invalid utf-8", [&](testing & t) { + t.assert_true("invalid utf-8 does not crash", + fuzz_test_template( + "{{ data|tojson(ensure_ascii=true) }}", + {{"data", std::string("hello\xfe\xffworld")}} + )); + }); }