more fixes, more tests
commit 026730e8e3
parent 1cf25734a9
@@ -1,4 +1,5 @@
#include "jinja-lexer.h"
#include "jinja-vm.h"

#include <vector>
#include <string>
@@ -7,13 +8,73 @@
#include <stdexcept>
#include <cctype>
#include <functional>
#include <string_view>

// #define JJ_DEBUG(msg, ...) printf("jinja-lexer: " msg "\n", __VA_ARGS__)
#define JJ_DEBUG(msg, ...) // no-op
#define FILENAME "jinja-lexer"

namespace jinja {

// Trim template markers with '-' for whitespace control
// Example: [spaces]{%- ... -%} --> {% ... %}
static void trim_template_markers_inplace(std::string & s) {
    // i = read pointer ; j = write pointer (j <= i)
    size_t j = 0; // Write pointer
    const size_t len = s.length();

    for (size_t i = 0; i < len; ) {
        bool handled = false;

        // We need at least 3 characters for any marker: {X- or -X}
        if (i + 2 < len) {
            const char c1 = s[i];
            const char c2 = s[i + 1];
            const char c3 = s[i + 2];

            // 1. Closing trim: -X} where X = %, }, #
            // Example: [content]-%} [spaces] -> [content]%}
            if (c1 == '-' && c3 == '}' && (c2 == '%' || c2 == '}' || c2 == '#')) {
                s[j++] = c2;
                s[j++] = '}';
                i += 3;
                // Strip leading whitespace AFTER the tag
                while (i < len && std::isspace(static_cast<unsigned char>(s[i]))) {
                    i++;
                }
                handled = true;
            }
            // 2. Opening trim: {X- where X = %, {, #
            // Example: [spaces]{%- [content] -> {% [content]
            else if (c1 == '{' && c3 == '-' && (c2 == '%' || c2 == '{' || c2 == '#')) {
                // Trim trailing whitespace BEFORE the tag by moving the write pointer back
                while (j > 0 && std::isspace(static_cast<unsigned char>(s[j - 1]))) {
                    j--;
                }

                // Safety: Prevent merging '{' with tag start (avoid creating '{{%' or '{{{')
                // if the character immediately before our new tag is a literal '{'.
                if (j > 0 && s[j - 1] == '{') {
                    s[j++] = ' ';
                }

                s[j++] = '{';
                s[j++] = c2;
                i += 3;
                handled = true;
            }
        }

        if (!handled) {
            // Note: j is always <= i here, so this is safe.
            s[j++] = s[i++];
        }
    }

    s.resize(j);
}

std::string lexer::preprocess(const std::string & template_str, const preprocess_options & options) const {
    std::string result = template_str;
    // According to https://jinja.palletsprojects.com/en/3.0.x/templates/#whitespace-control
@@ -40,12 +101,7 @@ std::string lexer::preprocess(const std::string & template_str, const preprocess
    }

    // Handle whitespace control with - in tags
    result = std::regex_replace(result, std::regex(R"(-%\}\s*)"), "%}");
    result = std::regex_replace(result, std::regex(R"(\s*\{%-)"), "{%");
    result = std::regex_replace(result, std::regex(R"(-\}\}\s*)"), "}}");
    result = std::regex_replace(result, std::regex(R"(\s*\{\{-)"), "{{");
    result = std::regex_replace(result, std::regex(R"(-#\}\s*)"), "#}");
    result = std::regex_replace(result, std::regex(R"(\s*\{\#-)"), "{#");
    trim_template_markers_inplace(result);

    // Handle custom transformers-specific `generation` tag
    // See https://github.com/huggingface/transformers/pull/30650 for more information.
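As an editorial aside, here is a minimal sketch of the behaviour the single in-place pass above is meant to produce in place of the old regex passes. The inputs are illustrative, and the sketch assumes it lives in the same translation unit as the static helper:

#include <cassert>
#include <string>

// Illustrative check of the whitespace-control trimming (not part of the commit).
static void check_trim_behaviour() {
    // '-' markers are dropped and the surrounding whitespace is eaten.
    std::string a = "Hello   {%- if x -%}   World";
    trim_template_markers_inplace(a);
    assert(a == "Hello{% if x %}World");

    // Whitespace after a trimmed closing tag is removed across newlines too.
    std::string b = "{%- if x -%}\n  hello";
    trim_template_markers_inplace(b);
    assert(b == "{% if x %}hello");
}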
@@ -26,8 +26,10 @@ class parser {
    // for debugging; a token can be multiple chars in source
    std::vector<size_t> tok_pos_to_src_pos;

    std::string source; // for error reporting

public:
    parser(const std::vector<token> & t) : tokens(t) {
    parser(const std::vector<token> & t, const std::string & src) : tokens(t), source(src) {
        tok_pos_to_src_pos.resize(tokens.size());
        for (size_t i = 0; i < tokens.size(); i++) {
            tok_pos_to_src_pos[i] = tokens[i].pos;
@@ -46,7 +48,16 @@ public:
    std::unique_ptr<T> mk_stmt(Args&&... args) {
        auto ptr = std::make_unique<T>(std::forward<Args>(args)...);
        ptr->pos = tok_pos_to_src_pos[prev_cur];
        JJ_DEBUG("Created %s statement at src pos %zu", ptr->type().c_str(), ptr->pos);

        std::string snippet = "no source";
        if (!source.empty()) {
            size_t start_pos = ptr->pos;
            size_t end_pos = start_pos + 20;
            if (end_pos > source.size()) end_pos = source.size();
            snippet = source.substr(start_pos, end_pos - start_pos);
        }
        JJ_DEBUG("Created %-20s statement at src pos %-4zu (%s)", ptr->type().c_str(), ptr->pos, snippet.c_str());

        return ptr;
    }
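The debug snippet is simply a window of at most 20 characters of the preprocessed source, clamped so it never reads past the end. A standalone sketch of that clamping; the helper name is invented for illustration and is not part of the commit:

#include <algorithm>
#include <string>

// Hypothetical helper mirroring the clamping above: at most max_len characters
// of src starting at pos, never reading past the end of the string.
static std::string source_window(const std::string & src, size_t pos, size_t max_len = 20) {
    if (pos >= src.size()) {
        return "";
    }
    const size_t end = std::min(src.size(), pos + max_len);
    return src.substr(pos, end - pos);
}
// e.g. source_window("{{ messages[0].content }}", 3) == "messages[0].content "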
@@ -544,7 +555,9 @@ private:
            return mk_stmt<integer_literal>(std::stoll(t.value));
        case token::string_literal: {
            std::string val = t.value;
            while (is(token::string_literal)) val += tokens[current++].value;
            while (is(token::string_literal)) {
                val += tokens[current++].value;
            }
            return mk_stmt<string_literal>(val);
        }
        case token::identifier:
@@ -575,13 +588,17 @@ private:
            return mk_stmt<object_literal>(std::move(pairs));
        }
        default:
            throw std::runtime_error("Unexpected token: " + t.value);
            throw std::runtime_error("Unexpected token: " + t.value + " of type " + std::to_string(t.t));
        }
    }
};

program parse_from_tokens(const std::vector<token> & tokens) {
    return parser(tokens).parse();
    return parser(tokens, "").parse();
}

program parse_from_tokens(const lexer_result & lexer_res) {
    return parser(lexer_res.tokens, lexer_res.preprocessed_source).parse();
}

} // namespace jinja
@@ -13,4 +13,6 @@ namespace jinja {

program parse_from_tokens(const std::vector<token> & tokens);

program parse_from_tokens(const lexer_result & lexer_res);

} // namespace jinja
@@ -131,23 +131,23 @@ const func_builtins & global_builtins() {
        if (args.args.size() < 1 || args.args.size() > 3) {
            throw raised_exception("slice() takes between 1 and 3 arguments");
        }
        int64_t arg0 = is_val<value_int>(args.args[0]) ? args.args[0]->as_int() : 0;
        int64_t arg1 = is_val<value_int>(args.args[1]) ? args.args[1]->as_int() : -1;
        int64_t arg2 = is_val<value_int>(args.args[2]) ? args.args[2]->as_int() : 1;
        auto & arg0 = args.args[0];
        auto & arg1 = args.args[1];
        auto & arg2 = args.args[2];

        int64_t start, stop, step;
        if (args.args.size() == 1) {
            start = 0;
            stop = arg0;
            stop = arg0->as_int();
            step = 1;
        } else if (args.args.size() == 2) {
            start = arg0;
            stop = arg1;
            start = arg0->as_int();
            stop = arg1->as_int();
            step = 1;
        } else {
            start = arg0;
            stop = arg1;
            step = arg2;
            start = arg0->as_int();
            stop = arg1->as_int();
            step = arg2->as_int();
        }

        auto out = mk_val<value_array>();
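For reference, the argument handling above follows the usual range()-style convention; a worked mapping with illustrative values:

// slice(5)       -> start = 0, stop = 5, step = 1
// slice(2, 8)    -> start = 2, stop = 8, step = 1
// slice(2, 8, 3) -> start = 2, stop = 8, step = 3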
@@ -10,7 +10,7 @@
#include <memory>
#include <sstream>

#define JJ_DEBUG(msg, ...) if (g_jinja_debug) printf("%s:%3d : " msg "\n", FILENAME, __LINE__, __VA_ARGS__)
#define JJ_DEBUG(msg, ...) if (g_jinja_debug) printf("%s:%-3d : " msg "\n", FILENAME, __LINE__, __VA_ARGS__)

extern bool g_jinja_debug;
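The only change here is the %-3d field width flag, which left-justifies the line number so the debug columns stay aligned; illustrative output (the line number is invented, the file name reuses the FILENAME value shown earlier):

// "%s:%3d : "  -> "jinja-lexer:  7 : ..."   (line number right-justified)
// "%s:%-3d : " -> "jinja-lexer:7   : ..."   (line number left-justified)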
@@ -28,6 +28,8 @@ int main(void) {

    std::vector<std::string> failed_tests;

    bool stop_on_first_failure = false;

    auto is_ignored_file = [](const std::string & filename) -> bool {
        std::vector<std::string> ignored_files = {
            "Apriel-",
@@ -64,7 +66,9 @@ int main(void) {
                std::cout << "Exception: " << e.what() << "\n";
                std::cout << "=== ERROR WITH TEMPLATE FILE: " << entry.path().string() << " ===\n";
                failed_tests.push_back(entry.path().string());
                exit(1);
                if (stop_on_first_failure) {
                    break;
                }
            }
        }
    }
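These hunks only show the collection side of the failure tracking; a purely illustrative sketch, not part of this commit, of the kind of end-of-run summary that failed_tests enables:

// Hypothetical summary at the end of main(), assuming failed_tests is in scope:
if (!failed_tests.empty()) {
    std::cout << "\n" << failed_tests.size() << " template(s) failed:\n";
    for (const auto & path : failed_tests) {
        std::cout << "  " << path << "\n";
    }
    return 1; // non-zero exit so CI notices the failures
}
return 0;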
@@ -85,7 +89,7 @@ void run(std::string contents) {

    jinja::lexer lexer;
    jinja::preprocess_options options;
    options.trim_blocks = true;
    options.trim_blocks = false;
    options.lstrip_blocks = false;
    auto lexer_res = lexer.tokenize(contents, options);
    for (const auto & tok : lexer_res.tokens) {
@@ -93,7 +97,7 @@ void run(std::string contents) {
    }

    std::cout << "\n=== AST ===\n";
    jinja::program ast = jinja::parse_from_tokens(lexer_res.tokens);
    jinja::program ast = jinja::parse_from_tokens(lexer_res);
    for (const auto & stmt : ast.body) {
        //std::cout << "stmt type: " << stmt->type() << "\n";
    }