llama.cpp/common/jinja/lexer.h

158 lines
5.2 KiB
C++

#pragma once
#include "utils.h"
#include <cctype>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>
namespace jinja {
struct token {
enum type {
eof, // end of source
text, // The text between Jinja statements or expressions
numeric_literal, // e.g., 123, 1.0
string_literal, // 'string'
identifier, // Variables, functions, statements, booleans, etc.
equals, // =
open_paren, // (
close_paren, // )
open_statement, // {%
close_statement, // %}
open_expression, // {{
close_expression, // }}
open_square_bracket, // [
close_square_bracket, // ]
open_curly_bracket, // {
close_curly_bracket, // }
comma, // ,
dot, // .
colon, // :
pipe, // |
call_operator, // ()
additive_binary_operator, // + - ~
multiplicative_binary_operator, // * / %
comparison_binary_operator, // < > <= >= == !=
unary_operator, // ! - +
comment, // {# ... #}
};
type t;
std::string value;
size_t pos;
};
static std::string type_to_string(token::type t) {
switch (t) {
case token::eof: return "eof";
case token::text: return "text";
case token::numeric_literal: return "numeric_literal";
case token::string_literal: return "string_literal";
case token::identifier: return "identifier";
case token::equals: return "equals";
case token::open_paren: return "open_paren";
case token::close_paren: return "close_paren";
case token::open_statement: return "open_statement";
case token::close_statement: return "close_statement";
case token::open_expression: return "open_expression";
case token::close_expression: return "close_expression";
case token::open_square_bracket: return "open_square_bracket";
case token::close_square_bracket: return "close_square_bracket";
case token::open_curly_bracket: return "open_curly_bracket";
case token::close_curly_bracket: return "close_curly_bracket";
case token::comma: return "comma";
case token::dot: return "dot";
case token::colon: return "colon";
case token::pipe: return "pipe";
case token::call_operator: return "call_operator";
case token::additive_binary_operator: return "additive_binary_operator";
case token::multiplicative_binary_operator: return "multiplicative_binary_operator";
case token::comparison_binary_operator: return "comparison_binary_operator";
case token::unary_operator: return "unary_operator";
case token::comment: return "comment";
default: return "unknown";
}
}
struct lexer_result {
std::vector<token> tokens;
std::string source;
};
struct lexer {
const std::map<char, char> escape_chars = {
{'n', '\n'},
{'t', '\t'},
{'r', '\r'},
{'b', '\b'},
{'f', '\f'},
{'v', '\v'},
{'\\', '\\'},
{'\'', '\''},
{'\"', '\"'},
};
static bool is_word(char c) {
return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
}
static bool is_integer(char c) {
return std::isdigit(static_cast<unsigned char>(c));
}
const std::vector<std::pair<std::string, token::type>> ordered_mapping_table = {
// Trimmed control sequences
{"{%-", token::open_statement},
{"-%}", token::close_statement},
{"{{-", token::open_expression},
{"-}}", token::close_expression},
// Control sequences
{"{%", token::open_statement},
{"%}", token::close_statement},
{"{{", token::open_expression},
{"}}", token::close_expression},
// Single character tokens
{"(", token::open_paren},
{")", token::close_paren},
{"{", token::open_curly_bracket},
{"}", token::close_curly_bracket},
{"[", token::open_square_bracket},
{"]", token::close_square_bracket},
{",", token::comma},
{".", token::dot},
{":", token::colon},
{"|", token::pipe},
// Comparison operators
{"<=", token::comparison_binary_operator},
{">=", token::comparison_binary_operator},
{"==", token::comparison_binary_operator},
{"!=", token::comparison_binary_operator},
{"<", token::comparison_binary_operator},
{">", token::comparison_binary_operator},
// Arithmetic operators
{"+", token::additive_binary_operator},
{"-", token::additive_binary_operator},
{"~", token::additive_binary_operator},
{"*", token::multiplicative_binary_operator},
{"/", token::multiplicative_binary_operator},
{"%", token::multiplicative_binary_operator},
// Assignment operator
{"=", token::equals},
};
// tokenize the source string into a list of tokens
// may throw lexer_exception on error
lexer_result tokenize(const std::string & source);
};
struct lexer_exception : public std::runtime_error {
lexer_exception(const std::string & msg, const std::string & source, size_t pos)
: std::runtime_error(fmt_error_with_source("lexer", msg, source, pos)) {}
};
} // namespace jinja