more fix, more tests

2025-12-29 12:53:31 +01:00 · 2025-12-29 12:53:31 +01:00 · 026730e8e3
parent 1cf25734a9
commit 026730e8e3
6 changed files with 106 additions and 27 deletions
--- a/common/jinja/jinja-lexer.cpp
+++ b/common/jinja/jinja-lexer.cpp
@ -1,4 +1,5 @@
 #include "jinja-lexer.h"
+#include "jinja-vm.h"

 #include <vector>
 #include <string>
@ -7,13 +8,73 @@
 #include <stdexcept>
 #include <cctype>
 #include <functional>
+#include <string_view>

-
-// #define JJ_DEBUG(msg, ...)  printf("jinja-lexer: " msg "\n", __VA_ARGS__)
-#define JJ_DEBUG(msg, ...)  // no-op
+#define FILENAME "jinja-lexer"

 namespace jinja {

+// Apply Jinja '-' whitespace control in place: "-%}", "-}}", "-#}" lose the '-' and any whitespace after the tag;
+// "{%-", "{{-", "{#-" lose the '-' and any whitespace before the tag. Example: [spaces]{%- ... -%} --> {% ... %}
+#include <string> // NOTE(review): this include sits inside namespace jinja; <string> is already included at file scope above
+#include <cctype> // NOTE(review): same here; both in-namespace includes look redundant and are candidates for removal
+
+static void trim_template_markers_inplace(std::string & s) {
+    // i = read index ; j = write index into the same buffer (j <= i, because no rule emits more than it consumes)
+    size_t j = 0; // Write pointer: s[0..j) holds the rewritten output so far
+    const size_t len = s.length();
+    
+    for (size_t i = 0; i < len; ) { // i is advanced inside the body: by 1 for a plain copy, by at least 3 for a marker
+        bool handled = false;
+
+        // A marker is exactly 3 characters ({X- or -X}), so only look for one when s[i..i+2] exists
+        if (i + 2 < len) {
+            const char c1 = s[i];
+            const char c2 = s[i + 1];
+            const char c3 = s[i + 2]; // all three are copied out before any write below can overwrite them
+
+            // 1. Closing trim: -X} where X = %, }, #  (the '-' is dropped, "X}" is emitted)
+            // Example: [content]-%} [spaces] -> [content]%}
+            if (c1 == '-' && c3 == '}' && (c2 == '%' || c2 == '}' || c2 == '#')) {
+                s[j++] = c2;
+                s[j++] = '}';
+                i += 3;
+                // Strip leading whitespace AFTER the tag: the read index skips it, so it is never copied
+                while (i < len && std::isspace(static_cast<unsigned char>(s[i]))) { // unsigned char cast: std::isspace needs a non-negative value
+                    i++;
+                }
+                handled = true;
+            }
+            // 2. Opening trim: {X- where X = %, {, #  (the '-' is dropped, "{X" is emitted)
+            // Example: [spaces]{%- [content] -> {% [content]
+            else if (c1 == '{' && c3 == '-' && (c2 == '%' || c2 == '{' || c2 == '#')) {
+                // Trim trailing whitespace BEFORE the tag by moving the write pointer back over already-emitted output
+                while (j > 0 && std::isspace(static_cast<unsigned char>(s[j - 1]))) {
+                    j--;
+                }
+
+                // Safety: if the emitted output now ends in a literal '{', write one space first so that
+                // the brace does not fuse with the tag start into '{{%' or '{{{'.
+                if (j > 0 && s[j - 1] == '{') {
+                    s[j++] = ' ';
+                }
+
+                s[j++] = '{';
+                s[j++] = c2;
+                i += 3;
+                handled = true;
+            }
+        }
+
+        if (!handled) {
+            // Plain character: copy it down. j is always <= i here, so the write never clobbers unread input.
+            s[j++] = s[i++];
+        }
+    }
+
+    s.resize(j); // cut off the stale tail left beyond the rewritten output
+}
+
 std::string lexer::preprocess(const std::string & template_str, const preprocess_options & options) const {
    std::string result = template_str;
    // According to https://jinja.palletsprojects.com/en/3.0.x/templates/#whitespace-control
@ -40,12 +101,7 @@ std::string lexer::preprocess(const std::string & template_str, const preprocess
    }

    // Handle whitespace control with - in tags
-    result = std::regex_replace(result, std::regex(R"(-%\}\s*)"), "%}");
-    result = std::regex_replace(result, std::regex(R"(\s*\{%-)"), "{%");
-    result = std::regex_replace(result, std::regex(R"(-\}\}\s*)"), "}}");
-    result = std::regex_replace(result, std::regex(R"(\s*\{\{-)"), "{{");
-    result = std::regex_replace(result, std::regex(R"(-#\}\s*)"), "#}");
-    result = std::regex_replace(result, std::regex(R"(\s*\{\#-)"), "{#");
+    trim_template_markers_inplace(result);

    // Handle custom transformers-specific `generation` tag
    // See https://github.com/huggingface/transformers/pull/30650 for more information.
--- a/common/jinja/jinja-parser.cpp
+++ b/common/jinja/jinja-parser.cpp
@ -26,8 +26,10 @@ class parser {
    // for debugging; a token can be multiple chars in source
    std::vector<size_t> tok_pos_to_src_pos;

+    std::string source; // for error reporting
+
 public:
-    parser(const std::vector<token> & t) : tokens(t) {
+    parser(const std::vector<token> & t, const std::string & src) : tokens(t), source(src) {
        tok_pos_to_src_pos.resize(tokens.size());
        for (size_t i = 0; i < tokens.size(); i++) {
            tok_pos_to_src_pos[i] = tokens[i].pos;
@ -46,7 +48,16 @@ public:
    std::unique_ptr<T> mk_stmt(Args&&... args) { // builds a statement node of type T and stamps it with a source position
        auto ptr = std::make_unique<T>(std::forward<Args>(args)...);
        ptr->pos = tok_pos_to_src_pos[prev_cur]; // tok_pos_to_src_pos[i] mirrors tokens[i].pos (filled in the constructor)
-        JJ_DEBUG("Created %s statement at src pos %zu", ptr->type().c_str(), ptr->pos);
+
+        std::string snippet = "no source"; // placeholder used when the parser was constructed with an empty source string
+        if (!source.empty()) {
+            size_t start_pos = ptr->pos;
+            size_t end_pos = start_pos + 20; // debug snippet: at most 20 characters starting at the statement position
+            if (end_pos > source.size()) end_pos = source.size(); // clamp to the end of the source text
+            snippet = source.substr(start_pos, end_pos - start_pos); // NOTE(review): substr throws std::out_of_range if start_pos > source.size() — confirm positions always index into `source`
+        }
+        JJ_DEBUG("Created %-20s statement at src pos %-4zu (%s)", ptr->type().c_str(), ptr->pos, snippet.c_str()); // NOTE(review): snippet above is built on every call but only used by this debug line — consider guarding
+
        return ptr;
    }

@ -544,7 +555,9 @@ private:
                return mk_stmt<integer_literal>(std::stoll(t.value));
            case token::string_literal: {
                std::string val = t.value;
-                while (is(token::string_literal)) val += tokens[current++].value;
+                while (is(token::string_literal)) {
+                    val += tokens[current++].value;
+                }
                return mk_stmt<string_literal>(val);
            }
            case token::identifier:
@ -575,13 +588,17 @@ private:
                return mk_stmt<object_literal>(std::move(pairs));
            }
            default:
-                throw std::runtime_error("Unexpected token: " + t.value);
+                throw std::runtime_error("Unexpected token: " + t.value + " of type " + std::to_string(t.t));
        }
    }
 };

 program parse_from_tokens(const std::vector<token> & tokens) { // token-only overload: parses without any source text
-    return parser(tokens).parse();
+    return parser(tokens, "").parse(); // empty source: mk_stmt's debug snippet stays at its "no source" placeholder
+}
+
+program parse_from_tokens(const lexer_result & lexer_res) { // overload that also hands the preprocessed source text to the parser
+    return parser(lexer_res.tokens, lexer_res.preprocessed_source).parse(); // the parser stores its own copy of the source string
 }

 } // namespace jinja
--- a/common/jinja/jinja-parser.h
+++ b/common/jinja/jinja-parser.h
@ -13,4 +13,6 @@ namespace jinja {

 program parse_from_tokens(const std::vector<token> & tokens);

+program parse_from_tokens(const lexer_result & lexer_res);
+
 } // namespace jinja
--- a/common/jinja/jinja-value.cpp
+++ b/common/jinja/jinja-value.cpp
@ -131,23 +131,23 @@ const func_builtins & global_builtins() {
            if (args.args.size() < 1 || args.args.size() > 3) { // accept 1, 2 or 3 arguments only
                throw raised_exception("slice() takes between 1 and 3 arguments");
            }
-            int64_t arg0 = is_val<value_int>(args.args[0]) ? args.args[0]->as_int() : 0;
-            int64_t arg1 = is_val<value_int>(args.args[1]) ? args.args[1]->as_int() : -1;
-            int64_t arg2 = is_val<value_int>(args.args[2]) ? args.args[2]->as_int() : 1;
+            auto & arg0 = args.args[0];
+            auto & arg1 = args.args[1]; // NOTE(review): bound even when only 1 argument was passed; out-of-range operator[] is undefined behavior if args.args is a std::vector — confirm and move into the branches below
+            auto & arg2 = args.args[2]; // NOTE(review): same for calls with 1 or 2 arguments; arg2 is only read in the 3-argument branch

            int64_t start, stop, step;
            if (args.args.size() == 1) { // slice(stop)
                start = 0;
                stop = arg0->as_int(); // NOTE(review): the removed lines checked is_val<value_int> first; behavior of as_int() on non-integer values is not visible here
                step = 1;
            } else if (args.args.size() == 2) { // slice(start, stop)
                start = arg0->as_int();
                stop = arg1->as_int();
                step = 1;
            } else { // slice(start, stop, step)
                start = arg0->as_int();
                stop = arg1->as_int();
                step = arg2->as_int();
            }

            auto out = mk_val<value_array>();
--- a/common/jinja/jinja-vm.h
+++ b/common/jinja/jinja-vm.h
@ -10,7 +10,7 @@
 #include <memory>
 #include <sstream>

-#define JJ_DEBUG(msg, ...)  if (g_jinja_debug) printf("%s:%3d : " msg "\n", FILENAME, __LINE__, __VA_ARGS__)
+#define JJ_DEBUG(msg, ...)  if (g_jinja_debug) printf("%s:%-3d : " msg "\n", FILENAME, __LINE__, __VA_ARGS__)

 extern bool g_jinja_debug;

--- a/tests/test-chat-jinja.cpp
+++ b/tests/test-chat-jinja.cpp
@ -28,6 +28,8 @@ int main(void) {

    std::vector<std::string> failed_tests;

+    bool stop_on_first_failure = false;
+
    auto is_ignored_file = [](const std::string & filename) -> bool {
        std::vector<std::string> ignored_files = {
            "Apriel-",
@ -64,7 +66,9 @@ int main(void) {
                std::cout << "Exception: " << e.what() << "\n";
                std::cout << "=== ERROR WITH TEMPLATE FILE: " << entry.path().string() << " ===\n";
                failed_tests.push_back(entry.path().string());
-                exit(1);
+                if (stop_on_first_failure) {
+                    break;
+                }
            }
        }
    }
@ -85,7 +89,7 @@ void run(std::string contents) {

    jinja::lexer lexer;
    jinja::preprocess_options options;
-    options.trim_blocks = true;
+    options.trim_blocks = false;
    options.lstrip_blocks = false;
    auto lexer_res = lexer.tokenize(contents, options);
    for (const auto & tok : lexer_res.tokens) {
@ -93,7 +97,7 @@ void run(std::string contents) {
    }

    std::cout << "\n=== AST ===\n";
-    jinja::program ast = jinja::parse_from_tokens(lexer_res.tokens);
+    jinja::program ast = jinja::parse_from_tokens(lexer_res);
    for (const auto & stmt : ast.body) {
        //std::cout << "stmt type: " << stmt->type() << "\n";
    }