more fix, more tests

This commit is contained in:
Xuan Son Nguyen 2025-12-29 12:53:31 +01:00
parent 1cf25734a9
commit 026730e8e3
6 changed files with 106 additions and 27 deletions

View File

@ -1,4 +1,5 @@
#include "jinja-lexer.h"
#include "jinja-vm.h"
#include <vector>
#include <string>
@ -7,13 +8,73 @@
#include <stdexcept>
#include <cctype>
#include <functional>
#include <string_view>
// #define JJ_DEBUG(msg, ...) printf("jinja-lexer: " msg "\n", __VA_ARGS__)
#define JJ_DEBUG(msg, ...) // no-op
#define FILENAME "jinja-lexer"
namespace jinja {
// Trim template markers with '-' for whitespace control
// Example: [spaces]{%- ... -%} --> {% ... %}
#include <string>
#include <cctype>
// Apply Jinja's '-' whitespace control to `s`, compacting the string in place.
//
//   closing marker  -%}  -}}  -#}  : the '-' is dropped and every whitespace
//                                    character following the marker is removed
//   opening marker  {%-  {{-  {#-  : the '-' is dropped and every whitespace
//                                    character already emitted before the marker
//                                    is removed
//
// Two cursors walk the same buffer: `rd` reads the original text while `wr`
// writes the compacted text. `wr` never gets ahead of `rd`, so a write can never
// clobber a character that has not been read yet.
static void trim_template_markers_inplace(std::string & s) {
    const auto is_blank = [](char ch) {
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    const size_t total = s.length();
    size_t rd = 0; // next character to examine
    size_t wr = 0; // next slot to fill

    while (rd < total) {
        // Every marker is three characters long; a shorter tail is copied verbatim.
        if (total - rd < 3) {
            s[wr++] = s[rd++];
            continue;
        }

        // Cache the window before writing: `wr` may land on these positions.
        const char first = s[rd];
        const char kind  = s[rd + 1];
        const char last  = s[rd + 2];

        // Closing trim: "-X}" with X in { %, }, # }  ->  "X}" and skip the whitespace after it.
        const bool closes = first == '-' && last == '}' &&
                            (kind == '%' || kind == '}' || kind == '#');
        if (closes) {
            s[wr++] = kind;
            s[wr++] = '}';
            rd += 3;
            while (rd < total && is_blank(s[rd])) {
                ++rd;
            }
            continue;
        }

        // Opening trim: "{X-" with X in { %, {, # }  ->  "{X" after rewinding over
        // the whitespace that was already written.
        const bool opens = first == '{' && last == '-' &&
                           (kind == '%' || kind == '{' || kind == '#');
        if (opens) {
            while (wr > 0 && is_blank(s[wr - 1])) {
                --wr;
            }
            // A literal '{' right before the tag would fuse with it into "{{%" or
            // "{{{"; keep the two apart with a single space.
            if (wr > 0 && s[wr - 1] == '{') {
                s[wr++] = ' ';
            }
            s[wr++] = '{';
            s[wr++] = kind;
            rd += 3;
            continue;
        }

        // Ordinary character.
        s[wr++] = s[rd++];
    }

    s.resize(wr);
}
std::string lexer::preprocess(const std::string & template_str, const preprocess_options & options) const {
std::string result = template_str;
// According to https://jinja.palletsprojects.com/en/3.0.x/templates/#whitespace-control
@ -40,12 +101,7 @@ std::string lexer::preprocess(const std::string & template_str, const preprocess
}
// Handle whitespace control with - in tags
result = std::regex_replace(result, std::regex(R"(-%\}\s*)"), "%}");
result = std::regex_replace(result, std::regex(R"(\s*\{%-)"), "{%");
result = std::regex_replace(result, std::regex(R"(-\}\}\s*)"), "}}");
result = std::regex_replace(result, std::regex(R"(\s*\{\{-)"), "{{");
result = std::regex_replace(result, std::regex(R"(-#\}\s*)"), "#}");
result = std::regex_replace(result, std::regex(R"(\s*\{\#-)"), "{#");
trim_template_markers_inplace(result);
// Handle custom transformers-specific `generation` tag
// See https://github.com/huggingface/transformers/pull/30650 for more information.

View File

@ -26,8 +26,10 @@ class parser {
// for debugging; a token can be multiple chars in source
std::vector<size_t> tok_pos_to_src_pos;
std::string source; // for error reporting
public:
parser(const std::vector<token> & t) : tokens(t) {
parser(const std::vector<token> & t, const std::string & src) : tokens(t), source(src) {
tok_pos_to_src_pos.resize(tokens.size());
for (size_t i = 0; i < tokens.size(); i++) {
tok_pos_to_src_pos[i] = tokens[i].pos;
@ -46,7 +48,16 @@ public:
// Construct a node of type T from the forwarded arguments, record the source
// position of the token at index prev_cur on it, log its creation and return it.
std::unique_ptr<T> mk_stmt(Args&&... args) {
auto ptr = std::make_unique<T>(std::forward<Args>(args)...);
// tok_pos_to_src_pos[i] holds tokens[i].pos, so this maps the token index back to a source offset.
ptr->pos = tok_pos_to_src_pos[prev_cur];
JJ_DEBUG("Created %s statement at src pos %zu", ptr->type().c_str(), ptr->pos);
// Text shown in the debug line: at most 20 characters of source starting at the
// node position, or the placeholder when the parser was given an empty source.
std::string snippet = "no source";
if (!source.empty()) {
size_t start_pos = ptr->pos;
size_t end_pos = start_pos + 20;
// Clamp the window to the end of the source.
if (end_pos > source.size()) end_pos = source.size();
// NOTE(review): substr throws std::out_of_range when start_pos > source.size();
// presumably token positions always lie inside the source - confirm.
snippet = source.substr(start_pos, end_pos - start_pos);
}
JJ_DEBUG("Created %-20s statement at src pos %-4zu (%s)", ptr->type().c_str(), ptr->pos, snippet.c_str());
return ptr;
}
@ -544,7 +555,9 @@ private:
return mk_stmt<integer_literal>(std::stoll(t.value));
case token::string_literal: {
std::string val = t.value;
while (is(token::string_literal)) val += tokens[current++].value;
while (is(token::string_literal)) {
val += tokens[current++].value;
}
return mk_stmt<string_literal>(val);
}
case token::identifier:
@ -575,13 +588,17 @@ private:
return mk_stmt<object_literal>(std::move(pairs));
}
default:
throw std::runtime_error("Unexpected token: " + t.value);
throw std::runtime_error("Unexpected token: " + t.value + " of type " + std::to_string(t.t));
}
}
};
// Parse a bare token list into a program; no template source text is available
// through this overload.
// NOTE(review): two return statements appear back to back, so only the first is
// reachable. The second one passes an empty source string to the two-argument
// parser constructor - confirm which of the two is meant to remain.
program parse_from_tokens(const std::vector<token> & tokens) {
return parser(tokens).parse();
return parser(tokens, "").parse();
}
// Parse a lexer result into a program. The preprocessed template text is passed
// to the parser along with the tokens, so the parser has the source at hand.
program parse_from_tokens(const lexer_result & lexer_res) {
    const auto & toks = lexer_res.tokens;
    const auto & src  = lexer_res.preprocessed_source;
    return parser(toks, src).parse();
}
} // namespace jinja

View File

@ -13,4 +13,6 @@ namespace jinja {
program parse_from_tokens(const std::vector<token> & tokens);
program parse_from_tokens(const lexer_result & lexer_res);
} // namespace jinja

View File

@ -131,23 +131,23 @@ const func_builtins & global_builtins() {
if (args.args.size() < 1 || args.args.size() > 3) {
throw raised_exception("slice() takes between 1 and 3 arguments");
}
int64_t arg0 = is_val<value_int>(args.args[0]) ? args.args[0]->as_int() : 0;
int64_t arg1 = is_val<value_int>(args.args[1]) ? args.args[1]->as_int() : -1;
int64_t arg2 = is_val<value_int>(args.args[2]) ? args.args[2]->as_int() : 1;
auto & arg0 = args.args[0];
auto & arg1 = args.args[1];
auto & arg2 = args.args[2];
int64_t start, stop, step;
if (args.args.size() == 1) {
start = 0;
stop = arg0;
stop = arg0->as_int();
step = 1;
} else if (args.args.size() == 2) {
start = arg0;
stop = arg1;
start = arg0->as_int();
stop = arg1->as_int();
step = 1;
} else {
start = arg0;
stop = arg1;
step = arg2;
start = arg0->as_int();
stop = arg1->as_int();
step = arg2->as_int();
}
auto out = mk_val<value_array>();

View File

@ -10,7 +10,7 @@
#include <memory>
#include <sstream>
#define JJ_DEBUG(msg, ...) if (g_jinja_debug) printf("%s:%3d : " msg "\n", FILENAME, __LINE__, __VA_ARGS__)
#define JJ_DEBUG(msg, ...) if (g_jinja_debug) printf("%s:%-3d : " msg "\n", FILENAME, __LINE__, __VA_ARGS__)
extern bool g_jinja_debug;

View File

@ -28,6 +28,8 @@ int main(void) {
std::vector<std::string> failed_tests;
bool stop_on_first_failure = false;
auto is_ignored_file = [](const std::string & filename) -> bool {
std::vector<std::string> ignored_files = {
"Apriel-",
@ -64,7 +66,9 @@ int main(void) {
std::cout << "Exception: " << e.what() << "\n";
std::cout << "=== ERROR WITH TEMPLATE FILE: " << entry.path().string() << " ===\n";
failed_tests.push_back(entry.path().string());
exit(1);
if (stop_on_first_failure) {
break;
}
}
}
}
@ -85,7 +89,7 @@ void run(std::string contents) {
jinja::lexer lexer;
jinja::preprocess_options options;
options.trim_blocks = true;
options.trim_blocks = false;
options.lstrip_blocks = false;
auto lexer_res = lexer.tokenize(contents, options);
for (const auto & tok : lexer_res.tokens) {
@ -93,7 +97,7 @@ void run(std::string contents) {
}
std::cout << "\n=== AST ===\n";
jinja::program ast = jinja::parse_from_tokens(lexer_res.tokens);
jinja::program ast = jinja::parse_from_tokens(lexer_res);
for (const auto & stmt : ast.body) {
//std::cout << "stmt type: " << stmt->type() << "\n";
}