Merge 8df64b28a7 into 88915cb55c

2026-03-15 19:02:01 +00:00 · 2026-03-15 19:02:01 +00:00 · 94b25e23e0
parent 88915cb55c 8df64b28a7
commit 94b25e23e0
33 changed files with 553 additions and 415 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -1833,6 +1833,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
        [](common_params & params, const std::string & value) {
            params.sampling.grammar = value;
+            params.sampling.grammar_external = true;
        }
    ).set_sparam());
    add_opt(common_arg(
@ -1840,6 +1841,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "file to read grammar from",
        [](common_params & params, const std::string & value) {
            params.sampling.grammar = read_file(value);
+            params.sampling.grammar_external = true;
        }
    ).set_sparam());
    add_opt(common_arg(
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -1,3 +1,4 @@
+#include "chat-auto-parser-helpers.h"
 #include "chat-auto-parser.h"
 #include "chat-peg-parser.h"
 #include "chat.h"
@ -23,13 +24,13 @@ static void foreach_function(const json & tools, const std::function<void(const

 namespace autoparser {

-parser_build_context::parser_build_context(common_chat_peg_builder & p, const templates_params & inputs) :
+parser_build_context::parser_build_context(common_chat_peg_builder & p, const generation_params & inputs) :
    p(p),
    inputs(inputs),
    reasoning_parser(p.eps()) {}

 common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct templates_params & inputs) {
+                                                  const struct generation_params & inputs) {
    // Run differential analysis to extract template structure
    struct autoparser autoparser;
    autoparser.analyze_template(tmpl);
@ -37,17 +38,16 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
 }

 common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct templates_params & inputs,
+                                                  const struct generation_params & inputs,
                                                  const autoparser &              autoparser) {
-    // Build the parser using the analysis results
-    auto parser = autoparser.build_parser(inputs);
-
    // Create the result structure
    common_chat_params data;
    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.preserved_tokens = autoparser.preserved_tokens;
-    data.parser           = parser.save();
+
+    auto parser = autoparser.build_parser(inputs);
+    data.parser = parser.save();

    // Build grammar if tools are present
    bool has_tools =
@ -82,44 +82,38 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
    return data;
 }

-common_peg_arena autoparser::build_parser(const templates_params & inputs) const {
+common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
    if (!analysis_complete) {
        throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
    }
    return build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        // If the template uses Python dict format (single-quoted strings in JSON structures),
-        // pre-register a json-string rule that accepts both quote styles. This must happen
-        // before any call to p.json() so that all JSON parsing inherits the flexible rule.
-        if (tools.format.uses_python_dicts) {
-            p.rule("json-string", p.quoted_string());
-        }
-
        parser_build_context ctx(p, inputs);
        bool                 extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-        bool                 enable_thinking   = inputs.enable_thinking;

-        ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
+        ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
        ctx.content              = &content;

        // Build reasoning parser
        ctx.reasoning_parser = reasoning.build_parser(ctx);

+        auto parser = p.eps();
+
        bool has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
        bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();

        if (has_response_format) {
            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
-            return ctx.reasoning_parser + p.space() + p.choice({
+            parser = ctx.reasoning_parser + p.space() + p.choice({
                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
                response_format
            }) + p.end();
+        } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
+            parser = tools.build_parser(ctx);
+        } else {
+            parser = content.build_parser(ctx);
        }
-
-        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
-            return tools.build_parser(ctx);
-        }
-
-        return content.build_parser(ctx);
+        parser = wrap_for_generation_prompt(p, parser, inputs, reasoning);
+        return parser;
    });
 }

@ -130,24 +124,15 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
        return p.eps();
    }

-    bool thinking_forced_open   = (mode == reasoning_mode::FORCED_OPEN);
-    bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
-
-    if (thinking_forced_open || thinking_forced_closed) {
-        // Thinking is forced open OR forced closed with enable_thinking=true
-        // In both cases, expect only the closing tag (opening was in template)
-        // However, since we might have incorrectly detected the open/close pattern,
-        // we admit an optional starting marker
-        return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
-    }
    if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
-        // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
-        // Both use the same tag-based pattern if markers are available
-        if (!start.empty() && !end.empty()) {
-            return p.optional(start + p.reasoning(p.until(end)) + end);
+        if (!end.empty()) {
+            if (!start.empty()) {
+                // Standard tag-based: optional(<think>reasoning</think>)
+                return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
+            }
+            // Delimiter-style (empty start)
+            return p.optional(p.reasoning(p.until(end)) + end + p.space());
        }
-    } else if (mode == reasoning_mode::DELIMITER) {
-        return p.optional(p.reasoning(p.until(end)) + end);
    }

    return p.eps();
@ -335,7 +320,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                                                     "tool-" + name + "-arg-" + param_name + "-schema",
                                                                     param_schema, true)) :
                                    p.tool_arg_json_value(p.schema(
-                                        p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, format.uses_python_dicts)) +
+                                        p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
                                        p.space()) +
                p.tool_arg_close(p.literal(arguments.value_suffix)));

--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@ -1,9 +1,11 @@
 #include "chat-auto-parser-helpers.h"

 #include "chat-auto-parser.h"
+#include "chat-peg-parser.h"
 #include "chat.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
+#include "peg-parser.h"

 #include <cctype>
 #include <numeric>
@ -291,10 +293,26 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
    return result;
 }

+// Prefixes `prs` with a literal matcher for the generation prompt.
+//
+// p         - builder used to create the literal parser
+// prs       - parser that is placed after the generation prompt literal
+// inputs    - only `generation_prompt` is read; when it is empty, `prs` is returned unchanged
+// reasoning - when `start` is non-empty and occurs in the generation prompt, the prompt is
+//             truncated at the first occurrence, so the literal covers only the text that
+//             precedes the reasoning start marker
+common_peg_parser wrap_for_generation_prompt(common_chat_peg_builder &             p,
+                                             const common_peg_parser &             prs,
+                                             const autoparser::generation_params & inputs,
+                                             const autoparser::analyze_reasoning & reasoning) {
+    const std::string & gen_prompt = inputs.generation_prompt;
+    if (gen_prompt.empty()) {
+        return prs;
+    }
+
+    // Search for the marker once and reuse the position instead of calling find() twice.
+    size_t end_pos = gen_prompt.size();
+    if (!reasoning.start.empty()) {
+        const size_t marker_pos = gen_prompt.find(reasoning.start);
+        if (marker_pos != std::string::npos) {
+            end_pos = marker_pos;
+        }
+    }
+    return p.literal(gen_prompt.substr(0, end_pos)) + prs;
+}
+
 namespace autoparser {

 std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
-    templates_params tmpl_params;
+    generation_params tmpl_params;
    tmpl_params.messages              = params.messages;
    tmpl_params.tools                 = params.tools;
    tmpl_params.add_generation_prompt = params.add_generation_prompt;
--- a/common/chat-auto-parser-helpers.h
+++ b/common/chat-auto-parser-helpers.h
@ -1,6 +1,7 @@
 #pragma once

 #include "chat-auto-parser.h"
+#include "peg-parser.h"
 #include <functional>
 #include <optional>
 #include <string>
@ -57,6 +58,11 @@ std::vector<segment> segmentize_markers(const std::string & text);
 //                                   (MARKER, "</function>"), (MARKER, "</tool_call>") ]
 std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments);

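+// Prepend a literal matcher for the generation prompt to `prs`; if the prompt contains the
+// reasoning start marker, only the part of the prompt before that marker is matched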
+common_peg_parser wrap_for_generation_prompt(common_chat_peg_builder &             p,
+                                             const common_peg_parser &             prs,
+                                             const autoparser::generation_params & inputs,
+                                             const autoparser::analyze_reasoning & reasoning);
 namespace autoparser {

 // Apply a template with the given parameters, returning the rendered string (empty on failure)
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@ -50,7 +50,7 @@ namespace autoparser {
 // High-level params for parser generation
 // ============================================================================

-struct templates_params {
+struct generation_params {
    json                                  messages;
    json                                  tools;
    common_chat_tool_choice               tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
@ -62,6 +62,7 @@ struct templates_params {
    bool                                  add_generation_prompt = false;
    bool                                  enable_thinking       = true;
    std::chrono::system_clock::time_point now                   = std::chrono::system_clock::now();
+    std::string                           generation_prompt;
    json                                  extra_context;
    bool                                  add_bos       = false;
    bool                                  add_eos       = false;
@ -77,11 +78,7 @@ struct templates_params {
 // Reasoning handling mode (derived from R1-R3 comparisons)
 enum class reasoning_mode {
    NONE,           // No reasoning markers detected
-    TAG_BASED,      // Standard tag-based: <think>...</think>
-    DELIMITER,      // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
-    FORCED_OPEN,    // Template ends with open reasoning tag (empty start, non-empty end)
-    FORCED_CLOSED,  // Template ends with open reasoning tag on enabled thinking but
-                    // with both opened and closed tag for disabled thinking
+    TAG_BASED,      // Tag-based: <think>...</think> (start can be empty for delimiter-style)
    TOOLS_ONLY      // Only reason on tool calls, not on normal content
 };

@ -91,12 +88,6 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode)
            return os << "NONE";
        case reasoning_mode::TAG_BASED:
            return os << "TAG_BASED";
-        case reasoning_mode::DELIMITER:
-            return os << "DELIMITER";
-        case reasoning_mode::FORCED_OPEN:
-            return os << "FORCED_OPEN";
-        case reasoning_mode::FORCED_CLOSED:
-            return os << "FORCED_CLOSED";
        case reasoning_mode::TOOLS_ONLY:
            return os << "TOOLS_ONLY";
        default:
@ -184,7 +175,6 @@ struct tool_format_analysis {

    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
-    bool uses_python_dicts = false;     // Tool call args use Python dict format (single-quoted strings)

    std::string              function_field = "function";
    std::string              name_field     = "name";
@ -225,12 +215,12 @@ struct analyze_content;

 struct parser_build_context {
    common_chat_peg_builder & p;
-    const templates_params &          inputs;
+    const generation_params &          inputs;
    common_peg_parser                 reasoning_parser;
    bool                              extracting_reasoning = false;
    const analyze_content *           content              = nullptr;

-    parser_build_context(common_chat_peg_builder & p, const templates_params & inputs);
+    parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);
 };

 // ============================================================================
@ -260,6 +250,7 @@ struct analyze_reasoning : analyze_base {

    analyze_reasoning() = default;
    analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
+    analyze_reasoning(std::string start_, std::string end_) : start(std::move(start_)), end(std::move(end_)) {}

    common_peg_parser build_parser(parser_build_context & ctx) const override;

@ -381,7 +372,7 @@ struct autoparser {
    void analyze_template(const common_chat_template & tmpl);

    // Build the PEG parser for this template
-    common_peg_arena build_parser(const templates_params & inputs) const;
+    common_peg_arena build_parser(const generation_params & inputs) const;

  private:
    // Collect tokens from entire analysis to preserve
@ -395,10 +386,10 @@ struct autoparser {
 class peg_generator {
  public:
    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct templates_params & inputs);
+                                              const struct generation_params & inputs);

    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct templates_params & inputs,
+                                              const struct generation_params & inputs,
                                              const autoparser &              autoparser);
 };

--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@ -2,6 +2,7 @@
 #include "chat-auto-parser-helpers.h"
 #include "chat-peg-parser.h"
 #include "chat.h"
+#include "common.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
 #include "peg-parser.h"
@ -31,8 +32,9 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
          if (tmpl.src.find("content.split('</think>')") != std::string::npos &&
              tmpl.src.find("reasoning_content") == std::string::npos &&
+              tmpl.src.find("<SPECIAL_12>") == std::string::npos &&
              analysis.reasoning.mode == reasoning_mode::NONE) {
-              analysis.reasoning.mode  = reasoning_mode::FORCED_OPEN;
+              analysis.reasoning.mode  = reasoning_mode::TAG_BASED;
              analysis.reasoning.start = "<think>";
              analysis.reasoning.end   = "</think>";
              analysis.preserved_tokens.push_back("<think>");
@ -185,7 +187,6 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
    LOG_DBG("func_name_prefix: '%s'\n", tools.function.name_prefix.c_str());
    LOG_DBG("func_name_suffix: '%s'\n", tools.function.name_suffix.c_str());
    LOG_DBG("func_close: '%s'\n", tools.function.close.c_str());
-    LOG_DBG("python_dict_format: %s\n", tools.format.uses_python_dicts ? "true" : "false");
    LOG_DBG("arg_name_prefix: '%s'\n", tools.arguments.name_prefix.c_str());
    LOG_DBG("arg_name_suffix: '%s'\n", tools.arguments.name_suffix.c_str());
    LOG_DBG("arg_value_prefix: '%s'\n", tools.arguments.value_prefix.c_str());
@ -295,16 +296,12 @@ void analyze_reasoning::compare_reasoning_presence() {
        }
        if (result.result.success()) {
            if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
-                if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
-                    mode = reasoning_mode::TAG_BASED;
-                } else {
-                    mode = reasoning_mode::FORCED_CLOSED;
-                }
+                mode = reasoning_mode::TAG_BASED;
                start = trim_whitespace(result.tags["pre"]);
-                end   = result.tags["post"];
+                end   = trim_trailing_whitespace(result.tags["post"]);
            } else if (!result.tags["post"].empty()) {
-                mode = reasoning_mode::DELIMITER;
-                end = result.tags["post"];
+                mode = reasoning_mode::TAG_BASED;
+                end = trim_trailing_whitespace(result.tags["post"]);
            }
        }
    }
@ -331,53 +328,30 @@ void analyze_reasoning::compare_thinking_enabled() {
    const auto & diff = comparison->diff;

    std::string left_trimmed = trim_whitespace(diff.left);
+    std::string right_trimmed = trim_whitespace(diff.right);

    if (left_trimmed.empty() && !diff.right.empty()) {
-        std::string right_trimmed = trim_whitespace(diff.right);
-
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
                start = right_trimmed;
-                mode  = reasoning_mode::FORCED_OPEN;
+                mode  = reasoning_mode::TAG_BASED;
+            }
+        }
+    } else if (right_trimmed.empty() && !diff.left.empty()) {
+        if (!left_trimmed.empty() && string_ends_with(comparison->output_A, left_trimmed)) {
+            if (end.empty()) {
+                auto seg = prune_whitespace_segments(segmentize_markers(comparison->output_A));
+                if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
+                    start = seg[seg.size() - 2].value;
+                }
+                end = left_trimmed;
+                mode = reasoning_mode::TAG_BASED;
            }
        }
    }

-    if (start.empty() && !end.empty()) {
-        mode = reasoning_mode::DELIMITER;
-    }
-
-    // Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
-    // but enable_thinking=true produces only the start marker
-    if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
-        auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.literal(start) + p.space() + p.literal(end) + p.rest();
-        });
-        auto parser_start_end = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.literal(start)) + p.space() + p.negate(p.literal(end)) + p.rest();
-        });
-        if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
-            parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
-            mode = reasoning_mode::FORCED_CLOSED;
-        } else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
-            auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
-            if (result.result.success()) {
-                start = result.tags["pre"];
-                mode  = reasoning_mode::FORCED_CLOSED;
-            }
-        }
-    }
-
-    if (start.empty() && end.empty()) {  // we might still have the case of "just open" and "just close"
-        if (!diff.left.empty() && !diff.right.empty()) {
-            auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
-            auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
-            if (seg_A.size() == 1 && seg_B.size() == 1) {
-                mode = reasoning_mode::FORCED_CLOSED;
-                start = seg_B[0].value;
-                end = seg_A[0].value;
-            }
-        }
+    if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
+        mode = reasoning_mode::TAG_BASED;
    }
 }

@ -426,14 +400,14 @@ void analyze_reasoning::compare_reasoning_scope() {
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
        if (result.result.success()) {
            start = result.tags["pre"];
-            end = result.tags["post"];
+            end = trim_trailing_whitespace(result.tags["post"]);
        } else {
            auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
                return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
            });
            result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
            if (result.result.success()) {
-                end = result.tags["post"];
+                end = trim_trailing_whitespace(result.tags["post"]);
            } else {
                LOG_DBG(ANSI_ORANGE "%s: Unable to extracft reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
                mode = reasoning_mode::NONE;
@ -600,33 +574,23 @@ void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
        return;
    }

-    enum class json_quote_style { NONE, DOUBLE_QUOTES, SINGLE_QUOTES };
-
-    auto in_json_haystack = [&haystack](const std::string & needle) -> json_quote_style {
+    auto in_json_haystack = [&haystack](const std::string & needle) -> bool {
        auto parser = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
            return p.choice({ p.literal("{"), p.literal(":") }) << p.choice({
-                p.tag("sq", p.literal("'") + p.literal(needle) + p.literal("'")),
                p.tag("dq", p.literal("\"") + p.literal(needle) + p.literal("\"")) });
        });
        auto result = parser.parse_anywhere_and_extract(haystack);
-        if (!result.result.success()) {
-            return json_quote_style::NONE;
-        }
-        return result.tags.count("sq") && !result.tags["sq"].empty()
-            ? json_quote_style::SINGLE_QUOTES
-            : json_quote_style::DOUBLE_QUOTES;
+        return result.result.success();
    };

    auto fun_quote = in_json_haystack(fun_name_needle);
    auto arg_quote = in_json_haystack(arg_name_needle);

-    if (fun_quote != json_quote_style::NONE) {
+    if (fun_quote) {
        // no need to check further, we're in JSON land
        format.mode = tool_format::JSON_NATIVE;
-        format.uses_python_dicts = (fun_quote == json_quote_style::SINGLE_QUOTES);
-    } else if (arg_quote != json_quote_style::NONE) {
+    } else if (arg_quote) {
        format.mode = tool_format::TAG_WITH_JSON;
-        format.uses_python_dicts = (arg_quote == json_quote_style::SINGLE_QUOTES);
    } else {
        format.mode = tool_format::TAG_WITH_TAGGED;
    }
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@ -229,6 +229,20 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena &    arena,
        result.tool_calls.push_back(pending_tool_call.value());
        pending_tool_call.reset();
    }
+
+    // Discard whitespace-only reasoning content (e.g. from <think></think> prefill)
+    if (!result.reasoning_content.empty()) {
+        bool all_whitespace = true;
+        for (char c : result.reasoning_content) {
+            if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
+                all_whitespace = false;
+                break;
+            }
+        }
+        if (all_whitespace) {
+            result.reasoning_content.clear();
+        }
+    }
 }

 void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -1,5 +1,6 @@
 #include "chat.h"

+#include "chat-auto-parser-helpers.h"
 #include "chat-auto-parser.h"
 #include "chat-peg-parser.h"
 #include "common.h"
@ -22,6 +23,7 @@
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <utility>
 #include <vector>

 using json = nlohmann::ordered_json;
@ -760,7 +762,7 @@ static void foreach_parameter(const json &

 std::string common_chat_template_direct_apply(
    const common_chat_template & tmpl,
-    const autoparser::templates_params & inputs,
+    const autoparser::generation_params & inputs,
    const std::optional<json> & messages_override,
    const std::optional<json> & tools_override,
    const std::optional<json> & additional_context) {
@ -811,7 +813,7 @@ std::string common_chat_template_direct_apply(
 }

 static common_chat_params common_chat_params_init_ministral_3(const common_chat_template &    tmpl,
-                                                              const autoparser::templates_params & inputs) {
+                                                              const autoparser::generation_params & inputs) {
    common_chat_params data;

    // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
@ -928,7 +930,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
 }

 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template &    tmpl,
-                                                          const autoparser::templates_params & inputs) {
+                                                          const autoparser::generation_params & inputs) {
    common_chat_params data;

    // Copy reasoning to the "thinking" field as expected by the gpt-oss template
@ -1074,7 +1076,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp

 // Functionary v3.2 - uses recipient-based format: >>>recipient\n{content}
 static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template &    tmpl,
-                                                                   const autoparser::templates_params & inputs) {
+                                                                   const autoparser::generation_params & inputs) {
    common_chat_params data;

    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
@ -1095,13 +1097,13 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
        // Build content parser for >>>all\n{content}
        // When tools are present, content stops before the next ">>>" (tool call)
        // When no tools, content goes until end
-        auto content_until_tool = p.literal(">>>all\n") + p.content(p.until(">>>"));
-        auto content_until_end  = p.literal(">>>all\n") + p.content(p.rest());
+        auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>"));
+        auto content_until_end  = p.literal("all\n") + p.content(p.rest());

        // If no tools or tool_choice is NONE, just parse content
        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
            // When no tools, just match the prefix and capture everything after
-            return content_until_end + p.end();
+            return wrap_for_generation_prompt(p, content_until_end + p.end(), inputs, autoparser::analyze_reasoning());
        }

        // Build tool call parsers for each available function
@ -1113,7 +1115,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_

            // Tool format: >>>function_name\n{json_args}
            auto tool_parser = p.tool(
-                p.tool_open(p.literal(">>>") + p.tool_name(p.literal(name)) + p.literal("\n")) +
+                p.tool_open(p.tool_name(p.literal(name)) + p.literal("\n")) +
                p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
            );

@ -1124,17 +1126,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
        auto tools_only = p.trigger_rule("tools", p.one_or_more(tool_choice));
        auto content_and_tools = content_until_tool + tools_only;

+        auto ret = p.eps();
        if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
            if (inputs.parallel_tool_calls) {
-                return p.choice({ content_and_tools, tools_only }) + p.end();
+                ret = p.choice({ content_and_tools, tools_only }) + p.end();
+            } else {
+                ret = p.choice({ content_until_tool + tool_choice, tools_only }) + p.end();
            }
-            return p.choice({ content_until_tool + tool_choice, tools_only }) + p.end();
+        } else if (inputs.parallel_tool_calls) {
+            ret = p.choice({ content_and_tools, content_only, tools_only }) + p.end();
+        } else {
+            auto content_and_tool = content_until_tool + tool_choice;
+            ret = p.choice({ content_and_tool, content_only, tool_choice }) + p.end();
        }
-        if (inputs.parallel_tool_calls) {
-            return p.choice({ content_and_tools, content_only, tools_only }) + p.end();
-        }
-        auto content_and_tool = content_until_tool + tool_choice;
-        return p.choice({ content_and_tool, content_only, tool_choice }) + p.end();
+        return wrap_for_generation_prompt(p, ret, inputs, autoparser::analyze_reasoning());
    });

    data.parser = parser.save();
@ -1164,14 +1169,12 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
 // Kimi K2 Thinking - uses unique tool call ID format: functions.<name>:<index>
 // The ID contains both the function name and an incrementing counter
 static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template &    tmpl,
-                                                          const autoparser::templates_params & inputs) {
+                                                          const autoparser::generation_params & inputs) {
    common_chat_params data;

    data.prompt             = common_chat_template_direct_apply(tmpl, inputs);
    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking  = true;
-    data.thinking_start_tag = "<think>";
-    data.thinking_end_tag   = "</think>";
    data.preserved_tokens  = {
        "<|tool_calls_section_begin|>",
        "<|tool_calls_section_end|>",
@ -1186,6 +1189,18 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

+    const std::string SECTION_BEGIN = "<|tool_calls_section_begin|>";
+    const std::string SECTION_END   = "<|tool_calls_section_end|>";
+    const std::string CALL_BEGIN    = "<|tool_call_begin|>";
+    const std::string ARGS_BEGIN    = "<|tool_call_argument_begin|>";
+    const std::string CALL_END      = "<|tool_call_end|>";
+
+    const std::string THINK_START = "<think>";
+    const std::string THINK_END   = "</think>";
+
+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;
+
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        // Kimi K2 Thinking format:
        // - Reasoning: <think>{reasoning}</think>
@ -1197,16 +1212,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
        //   <|tool_calls_section_end|>
        // The ID format is: functions.<function_name>:<counter> where counter is 0, 1, 2, ...

-                // Tool call markers
-        const std::string SECTION_BEGIN = "<|tool_calls_section_begin|>";
-        const std::string SECTION_END   = "<|tool_calls_section_end|>";
-        const std::string CALL_BEGIN    = "<|tool_call_begin|>";
-        const std::string ARGS_BEGIN    = "<|tool_call_argument_begin|>";
-        const std::string CALL_END      = "<|tool_call_end|>";
-
-        const std::string THINK_START   = "<think>";
-        const std::string THINK_END     = "</think>";
-
+        // Tool call markers
        auto end = p.end();

        // Note: this model is CRAZY. It can diverge from its supposed tool calling pattern in so many ways it's not funny.
@ -1218,7 +1224,8 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp

        // Content only parser (no tools)
        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
-            return reasoning + p.content(p.rest()) + end;
+            return wrap_for_generation_prompt(p, reasoning + p.content(p.rest()) + end,
+                inputs, autoparser::analyze_reasoning(THINK_START, THINK_END));
        }

        // Build tool call parsers for each available function
@ -1254,7 +1261,8 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp

        auto content_before_tools = p.content(p.until_one_of({ SECTION_BEGIN, CALL_BEGIN }));

-        return reasoning + content_before_tools + tool_calls + end;
+        return wrap_for_generation_prompt(p, reasoning + content_before_tools + tool_calls + end,
+            inputs, autoparser::analyze_reasoning(THINK_START, THINK_END));
    });

    data.parser = parser.save();
@ -1284,7 +1292,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
 // - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
 // Tool calls can appear multiple times (parallel tool calls)
 static common_chat_params common_chat_params_init_lfm2(const common_chat_template &    tmpl,
-                                                       const autoparser::templates_params & inputs) {
+                                                       const autoparser::generation_params & inputs) {
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
@ -1303,13 +1311,15 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

-
    const std::string TOOL_CALL_START = "<|tool_call_start|>";
    const std::string TOOL_CALL_END   = "<|tool_call_end|>";
    const std::string THINK_START     = "<think>";
    const std::string THINK_END       = "</think>";
-    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {

+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        auto end = p.end();

        auto reasoning = p.eps();
@ -1318,7 +1328,8 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
        }

        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
-            return reasoning + p.content(p.rest()) + end;
+            return wrap_for_generation_prompt(p, reasoning + p.content(p.rest()) + end, inputs,
+                autoparser::analyze_reasoning(THINK_START, THINK_END));
        }

        auto tool_calls = p.rule("tool-calls",
@ -1330,7 +1341,8 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat

        auto content = p.content(p.until(TOOL_CALL_START));

-        return reasoning + content + tool_calls + end;
+        return wrap_for_generation_prompt(p, reasoning + content + tool_calls + end, inputs,
+            autoparser::analyze_reasoning(THINK_START, THINK_END));
    });

    data.parser = parser.save();
@ -1356,7 +1368,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat

 static common_chat_params common_chat_params_init_gigachat_v3(
        const common_chat_template & tmpl,
-        const autoparser::templates_params & inputs) {
+        const autoparser::generation_params & inputs) {

    common_chat_params data;

@ -1373,6 +1385,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
    auto tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto ret = p.eps();
        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
            // Build a choice of all available tools
            auto tool_choice = p.choice();
@ -1395,13 +1408,14 @@ static common_chat_params common_chat_params_init_gigachat_v3(
            auto tool_call = p.rule("tool-call", p.literal(tool_call_start_prefix) + tool_choice);
            auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));

-            return p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
+            ret = p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
+        } else {
+            // Content only parser
+            include_grammar = false;
+            ret = p.content(p.rest());
        }

-        // Content only parser
-        include_grammar = false;
-        return p.content(p.rest());
-
+        return wrap_for_generation_prompt(p, ret, inputs, autoparser::analyze_reasoning());
    });

    data.parser = parser.save();
@ -1498,22 +1512,20 @@ static json common_chat_extra_context() {

 static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates *        tmpls,
                                                            const struct common_chat_templates_inputs & inputs) {
-    autoparser::templates_params params;
+    autoparser::generation_params params;
    params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
-    const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
-        ? *tmpls->template_tool_use
-        : *tmpls->template_default;
-    const auto & src = tmpl.source();
-    const auto & caps = tmpl.original_caps();
-    params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
-    params.add_generation_prompt = inputs.add_generation_prompt;
-    params.tool_choice = inputs.tool_choice;
+    const auto & tmpl =
+        params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
+    const auto & src        = tmpl.source();
+    const auto & caps       = tmpl.original_caps();
+    params.messages         = render_message_to_json(inputs.messages, tmpl.original_caps());
+    params.tool_choice      = inputs.tool_choice;
    params.reasoning_format = inputs.reasoning_format;
-    params.enable_thinking = inputs.enable_thinking;
-    params.grammar = inputs.grammar;
-    params.now = inputs.now;
-    params.add_bos = tmpls->add_bos;
-    params.add_eos = tmpls->add_eos;
+    params.enable_thinking  = inputs.enable_thinking;
+    params.grammar          = inputs.grammar;
+    params.now              = inputs.now;
+    params.add_bos          = tmpls->add_bos;
+    params.add_eos          = tmpls->add_eos;

    if (src.find("<|channel|>") == std::string::npos) {
        // map developer to system for all models except for GPT-OSS
@ -1532,6 +1544,15 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        workaround::requires_non_null_content(params.messages);
    }

+    params.add_generation_prompt = false;
+    std::string no_gen_prompt    = common_chat_template_direct_apply(tmpl, params);
+    params.add_generation_prompt = true;
+    std::string gen_prompt       = common_chat_template_direct_apply(tmpl, params);
+    auto        diff             = calculate_diff_split(no_gen_prompt, gen_prompt);
+    params.generation_prompt     = diff.right;
+
+    params.add_generation_prompt = inputs.add_generation_prompt;
+
    params.extra_context = common_chat_extra_context();
    for (auto el : inputs.chat_template_kwargs) {
        params.extra_context[el.first] = json::parse(el.second);
@ -1541,12 +1562,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        params.json_schema = json::parse(inputs.json_schema);
    }

-    // if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
-    //     LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
-    //     params.parallel_tool_calls = false;
-    // } else {
    params.parallel_tool_calls = inputs.parallel_tool_calls;
-    //}

    if (params.tools.is_array()) {
        if (params.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && !params.grammar.empty()) {
@ -1559,25 +1575,27 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        }
    }

+    common_chat_params early_return;
+
    // Ministral/Mistral Large 3 - uses special reasoning structure fixes, can't use autoparser
    // Note: Mistral Small 3.2 uses [CALL_ID] which Ministral doesn't have, so we can distinguish them
    if (src.find("[SYSTEM_PROMPT]") != std::string::npos && src.find("[TOOL_CALLS]") != std::string::npos &&
        src.find("[ARGS]") != std::string::npos && src.find("[CALL_ID]") == std::string::npos) {
        LOG_DBG("Using specialized template: Ministral/Magistral Large 3\n");
-        return common_chat_params_init_ministral_3(tmpl, params);
+        early_return = common_chat_params_init_ministral_3(tmpl, params);
    }

    // GPT-OSS - has unique channel-based structure that needs dedicated handler
    if (src.find("<|channel|>") != std::string::npos) {
        LOG_DBG("Using specialized template: GPT-OSS\n");
-        return common_chat_params_init_gpt_oss(tmpl, params);
+        early_return = common_chat_params_init_gpt_oss(tmpl, params);
    }

    // Functionary v3.2 - uses recipient-based format with >>>recipient\n{content}
    // Detection: template has ">>>all" for content and ">>>" prefix for tool calls
    if (src.find(">>>all") != std::string::npos && src.find(">>>${recipient}") != std::string::npos) {
        LOG_DBG("Using specialized template: Functionary v3.2\n");
-        return common_chat_params_init_functionary_v3_2(tmpl, params);
+        early_return = common_chat_params_init_functionary_v3_2(tmpl, params);
    }

    // Kimi K2 Thinking - uses unique tool call ID format: functions.<name>:<index>
@ -1585,7 +1603,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
    if (src.find("<|tool_calls_section_begin|>") != std::string::npos &&
        src.find("<|tool_call_begin|>") != std::string::npos) {
        LOG_DBG("Using specialized template: Kimi K2 Thinking\n");
-        return common_chat_params_init_kimi_k2(tmpl, params);
+        early_return = common_chat_params_init_kimi_k2(tmpl, params);
    }

    // LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
@ -1593,7 +1611,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
    if (src.find("<|tool_list_start|>") != std::string::npos &&
        src.find("<|tool_list_end|>") != std::string::npos) {
        LOG_DBG("Using specialized template: LFM2\n");
-        return common_chat_params_init_lfm2(tmpl, params);
+        early_return = common_chat_params_init_lfm2(tmpl, params);
    }

    // GigaChatV3 format detection
@ -1602,11 +1620,16 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        src.find("<|function_call|>") == std::string::npos
    ) {
        LOG_DBG("Using specialized template: GigaChatV3\n");
-        return common_chat_params_init_gigachat_v3(tmpl, params);
+        early_return = common_chat_params_init_gigachat_v3(tmpl, params);
+    }
+
+    if (!early_return.parser.empty()) {
+        early_return.generation_prompt = params.generation_prompt;
+        return early_return;
    }

    try {
-        LOG_DBG("Using differential autoparser\n");
+        LOG_DBG("%s: using differential autoparser\n", __func__);
        struct autoparser::autoparser autoparser;
        autoparser.analyze_template(tmpl);
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
@ -1614,13 +1637,11 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        if (auto_params.supports_thinking) {
            auto_params.thinking_start_tag = autoparser.reasoning.start;
            auto_params.thinking_end_tag   = autoparser.reasoning.end;
-            // FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
-            // (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
-            //  but forces <think> open when thinking is enabled)
-            auto_params.thinking_forced_open =
-                autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
-                autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
        }
+        auto_params.generation_prompt = params.generation_prompt;
+        common_peg_arena arena;
+        arena.load(auto_params.parser);
+        LOG_DBG("%s: generated parser:\n%s\n\nparser generation prompt: %s\n", __func__, arena.dump(arena.root()).c_str(), auto_params.generation_prompt.c_str());
        return auto_params;
    } catch (const std::exception & e) {
        throw std::invalid_argument(std::string("Unable to generate parser for this template. Automatic parser generation failed: ") + e.what());
@ -1718,14 +1739,18 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
        LOG_DBG("No parser definition detected, assuming pure content parser.");
    }

-    LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
+    const std::string effective_input = params.generation_prompt.empty()
+        ? input
+        : params.generation_prompt + input;
+
+    LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());

    common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
    if (params.debug) {
        flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
    }

-    common_peg_parse_context ctx(input, flags);
+    common_peg_parse_context ctx(effective_input, flags);
    auto result = parser.parse(ctx);

    if (result.fail()) {
--- a/common/chat.h
+++ b/common/chat.h
@ -24,7 +24,7 @@ using json = nlohmann::ordered_json;
 struct common_chat_templates;

 namespace autoparser {
-struct templates_params;
+struct generation_params;
 }  // namespace autoparser

 struct common_chat_tool_call {
@ -211,7 +211,7 @@ struct common_chat_params {
    std::string                         prompt;
    std::string                         grammar;
    bool                                grammar_lazy         = false;
-    bool                                thinking_forced_open = false;
+    std::string                         generation_prompt;
    bool                                supports_thinking    = false;
    std::string                         thinking_start_tag;  // e.g., "<think>"
    std::string                         thinking_end_tag;    // e.g., "</think>"
@ -228,14 +228,14 @@ struct common_chat_parser_params {
    common_reasoning_format reasoning_format     = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
    bool                    reasoning_in_content = false;
-    bool                    thinking_forced_open = false;
+    std::string             generation_prompt;
    bool                    parse_tool_calls     = true;
    bool                    debug                = false;  // Enable debug output for PEG parser
    common_peg_arena        parser               = {};
    common_chat_parser_params() = default;
    common_chat_parser_params(const common_chat_params & chat_params) {
-        format               = chat_params.format;
-        thinking_forced_open = chat_params.thinking_forced_open;
+        format  = chat_params.format;
+        generation_prompt = chat_params.generation_prompt;
    }
 };

@ -301,7 +301,7 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem

 std::string common_chat_template_direct_apply(
    const common_chat_template & tmpl,
-    const autoparser::templates_params & inputs,
+    const autoparser::generation_params & inputs,
    const std::optional<json> & messages_override = std::nullopt,
    const std::optional<json> & tools_override = std::nullopt,
    const std::optional<json> & additional_context = std::nullopt);
--- a/common/common.h
+++ b/common/common.h
@ -231,15 +231,21 @@ struct common_params_sampling {
    std::string                         grammar; // optional BNF-like grammar to constrain sampling
    bool                                grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
+    bool                                grammar_external = false; // is the grammar set by the user explicitly?
+                                                                  // if so, we must not pass extra grammar prefill to it
    std::set<llama_token>               preserved_tokens;

    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+    // Grammar prefill: tokens already present in the prompt generation message.
+    // Fed to the grammar sampler (to advance past pre-existing tokens) and used
+    // to determine the reasoning budget sampler's initial state.
+    std::string grammar_prefill;
+
    // reasoning budget sampler parameters
    // these are populated by the server/CLI based on chat template params
    int32_t                  reasoning_budget_tokens   = -1;   // -1 = disabled, >= 0 = token budget
-    bool                     reasoning_budget_activate_immediately = false;
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@ -163,9 +163,15 @@ static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
    ctx->force_pos = 0;
 }

+// forward declaration for use in clone
+static struct llama_sampler * common_reasoning_budget_init_state(
+        const struct llama_vocab * vocab, const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens, const std::vector<llama_token> & forced_tokens,
+        int32_t budget, common_reasoning_budget_state initial_state);
+
 static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
-    return common_reasoning_budget_init(
+    return common_reasoning_budget_init_state(
        ctx->vocab,
        ctx->start_matcher.tokens,
        ctx->end_matcher.tokens,
@ -191,13 +197,13 @@ static struct llama_sampler_i common_reasoning_budget_i = {
    /* .backend_set_input = */ nullptr,
 };

-struct llama_sampler * common_reasoning_budget_init(
-        const struct llama_vocab       * vocab,
-        const std::vector<llama_token> & start_tokens,
-        const std::vector<llama_token> & end_tokens,
-        const std::vector<llama_token> & forced_tokens,
-        int32_t                          budget,
-        common_reasoning_budget_state    initial_state) {
+static struct llama_sampler * common_reasoning_budget_init_state(
+        const struct llama_vocab             * vocab,
+        const std::vector<llama_token>       & start_tokens,
+        const std::vector<llama_token>       & end_tokens,
+        const std::vector<llama_token>       & forced_tokens,
+        int32_t                                budget,
+        common_reasoning_budget_state          initial_state) {
    // promote COUNTING with budget <= 0 to FORCING
    if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
        initial_state = REASONING_BUDGET_FORCING;
@ -217,3 +223,41 @@ struct llama_sampler * common_reasoning_budget_init(
        }
    );
 }
+
+// Creates a reasoning budget sampler whose initial state is derived from the
+// tokens that are already present in the prompt (prefill_tokens):
+//   - COUNTING if the prefill begins with start_tokens and end_tokens does not
+//     occur anywhere after that start sequence (reasoning is still open);
+//   - IDLE otherwise (no prefill, no start sequence, prefill does not begin
+//     with the start sequence, or reasoning was opened and closed again).
+// The chosen state is forwarded to common_reasoning_budget_init_state(), which
+// promotes COUNTING with budget <= 0 to FORCING.
+struct llama_sampler * common_reasoning_budget_init(
+        const struct llama_vocab       * vocab,
+        const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens,
+        const std::vector<llama_token> & forced_tokens,
+        int32_t                          budget,
+        const std::vector<llama_token> & prefill_tokens) {
+    common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
+
+    // An empty start sequence can never be "found" at the beginning of the prefill.
+    const bool opens_reasoning =
+        !start_tokens.empty() &&
+        prefill_tokens.size() >= start_tokens.size() &&
+        std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin());
+
+    if (opens_reasoning) {
+        initial_state = REASONING_BUDGET_COUNTING;
+
+        // Look for the end sequence anywhere after the start sequence, not only
+        // at the very end of the prefill: templates commonly append whitespace
+        // after the closing tag (e.g. "<think></think>\n"), and such a prefill
+        // must still be treated as "reasoning already closed".
+        if (!end_tokens.empty()) {
+            const auto after_start = prefill_tokens.begin() + (ptrdiff_t) start_tokens.size();
+            const auto end_pos     = std::search(after_start, prefill_tokens.end(),
+                                                 end_tokens.begin(), end_tokens.end());
+            if (end_pos != prefill_tokens.end()) {
+                initial_state = REASONING_BUDGET_IDLE;
+            }
+        }
+    }
+
+    return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
+}
+
+// Overload taking an explicit initial state instead of prefill tokens.
+// Forwards all arguments unchanged to common_reasoning_budget_init_state(),
+// which promotes COUNTING with budget <= 0 to FORCING.
+struct llama_sampler * common_reasoning_budget_init(
+        const struct llama_vocab       * vocab,
+        const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens,
+        const std::vector<llama_token> & forced_tokens,
+        int32_t                          budget,
+        common_reasoning_budget_state    initial_state) {
+    return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
+}
--- a/common/reasoning-budget.h
+++ b/common/reasoning-budget.h
@ -24,14 +24,26 @@ enum common_reasoning_budget_state {
 //   DONE:         passthrough forever
 //
 // Parameters:
-//   vocab         - vocabulary (used for UTF-8 boundary detection; can be nullptr)
-//   start_tokens  - token sequence that activates counting
-//   end_tokens    - token sequence for natural deactivation
-//   forced_tokens - token sequence forced when budget expires
-//   budget        - max tokens allowed in the reasoning block
-//   initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
-//                   note: COUNTING with budget <= 0 is promoted to FORCING
+//   vocab          - vocabulary (used for UTF-8 boundary detection; can be nullptr)
+//   start_tokens   - token sequence that activates counting
+//   end_tokens     - token sequence for natural deactivation
+//   forced_tokens  - token sequence forced when budget expires
+//   budget         - max tokens allowed in the reasoning block
+//   prefill_tokens - tokens already present in the prompt (generation prompt);
+//                    used to determine the initial state: COUNTING if they begin
+//                    with start_tokens (but don't also end with end_tokens),
+//                    IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
 //
+struct llama_sampler * common_reasoning_budget_init(
+        const struct llama_vocab       * vocab,
+        const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens,
+        const std::vector<llama_token> & forced_tokens,
+        int32_t                          budget,
+        const std::vector<llama_token> & prefill_tokens = {});
+
+// Variant that takes an explicit initial state (used by tests and clone).
+// COUNTING with budget <= 0 is promoted to FORCING.
 struct llama_sampler * common_reasoning_budget_init(
        const struct llama_vocab       * vocab,
        const std::vector<llama_token> & start_tokens,
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -5,9 +5,11 @@
 #include "reasoning-budget.h"

 #include <algorithm>
+#include <cctype>
 #include <cmath>
 #include <cstring>
 #include <unordered_map>
+#include <vector>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@ -251,6 +253,27 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        }
    }

+    // Feed grammar prefill tokens to the grammar sampler so it advances past
+    // reasoning markers that the template already placed in the prompt.
+    // Skipped when there is no vocab or when the grammar was supplied explicitly
+    // by the user (grammar_external): the prefill must not be fed to such a grammar.
+    std::vector<llama_token> prefill_tokens;
+    if (!params.grammar_prefill.empty() && vocab && !params.grammar_external) {
+        prefill_tokens = common_tokenize(vocab, params.grammar_prefill, false, true);
+        if (!prefill_tokens.empty()) {
+            const std::string first_piece = common_token_to_piece(vocab, prefill_tokens[0], true);
+            // std::isspace has undefined behaviour for negative char values (e.g. UTF-8
+            // bytes on platforms where char is signed), so go through unsigned char.
+            const bool piece_starts_with_space =
+                !first_piece.empty() && std::isspace(static_cast<unsigned char>(first_piece[0]));
+            const bool prefill_starts_with_space =
+                std::isspace(static_cast<unsigned char>(params.grammar_prefill[0]));
+            if (piece_starts_with_space && !prefill_starts_with_space) {
+                // Some tokenizers will add a space before the first special token, need to remove
+                prefill_tokens.erase(prefill_tokens.begin());
+            }
+        }
+
+        if (grmr) {
+            for (const auto & token : prefill_tokens) {
+                llama_sampler_accept(grmr, token);
+                LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
+            }
+        }
+    }
+
    // reasoning budget sampler — added first so it can force tokens before other samplers
    if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
        samplers.push_back(common_reasoning_budget_init(
@ -259,7 +282,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
            params.reasoning_budget_end,
            params.reasoning_budget_forced,
            params.reasoning_budget_tokens,
-            params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
+            prefill_tokens));
    }

    if (params.has_logit_bias()) {
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@ -14,7 +14,7 @@ The unified auto-parser uses a pure differential, compositional approach (inspir
 **Analysis + Parser Building in Two Steps**:

 1. `autoparser::autoparser tmpl_analysis(tmpl)` — runs all differential comparisons and populates the analysis structs
-2. `autoparser::peg_generator::generate_parser(tmpl, params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar
+2. `autoparser::peg_generator::generate_parser(tmpl, generation_params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar

 ## Data Structures

@ -34,7 +34,7 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h

 ### `analyze_tools` and its sub-structs

- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`, `uses_python_dicts`)
+- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`)
 - [common/chat-auto-parser.h:196-200](common/chat-auto-parser.h#L196-L200) — `tool_function_analysis`: `name_prefix`, `name_suffix`, `close` markers around function names
 - [common/chat-auto-parser.h:202-210](common/chat-auto-parser.h#L202-L210) — `tool_arguments_analysis`: `start/end` container markers, `name_prefix/suffix`, `value_prefix/suffix`, `separator`
 - [common/chat-auto-parser.h:212-217](common/chat-auto-parser.h#L212-L217) — `tool_id_analysis`: `pos` enum, `prefix`/`suffix` markers around call ID values
@ -47,12 +47,21 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
 | Value           | Description                                                                       |
 |-----------------|-----------------------------------------------------------------------------------|
 | `NONE`          | No reasoning markers detected                                                     |
-| `TAG_BASED`     | Standard tag-based: `<think>...</think>`                                          |
-| `DELIMITER`     | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`)   |
-| `FORCED_OPEN`   | Template ends with open reasoning tag when `enable_thinking=true`                 |
-| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start  |
+| `TAG_BASED`     | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats)  |
 | `TOOLS_ONLY`    | Reasoning only appears in tool call responses, not plain content                  |

+**Generation Prompt & Reasoning Prefill**: Computed in `common_chat_templates_apply_jinja` before invoking either the specialized handlers or the auto-parser, by rendering the template twice — once with `add_generation_prompt=false` and once with `add_generation_prompt=true` — and storing the diff suffix as `generation_params::generation_prompt`. This string is propagated into `common_chat_params::generation_prompt` and `common_chat_parser_params::generation_prompt`.
+
+The generation prompt is prepended to the model output before PEG parsing (in `common_chat_peg_parse`), and every parser is wrapped with `wrap_for_generation_prompt()` so that it can consume that prefix: the portion *before* the reasoning start marker (if any) is matched as a literal, which ensures that any boilerplate added by the template is consumed. The full string is also fed to the grammar sampler via `llama_sampler_accept` (stored in `common_params_sampling::grammar_prefill`), advancing the grammar past tokens already in the prompt. The same prefill tokens are also used to determine the reasoning budget sampler's initial state — COUNTING if the prefill tokens begin with the reasoning start sequence (but don't also contain the end sequence), IDLE otherwise.
+
+**`grammar_prefill`** (`common_params_sampling`): The generation prompt string, which is tokenized and fed to the grammar sampler at init time. It is only applied when `grammar_external` is false (i.e., the grammar was not set explicitly by the user).
+
+Three outcomes for reasoning-prefill handling (in `generate_parser()`):
+
+1. **Start+end in generation prompt** (e.g. `<think></think>\n`): the parser sees reasoning as opened and immediately closed; whitespace-only reasoning content is discarded.
+2. **Only start in generation prompt** (e.g. `<think>\n`): the parser sees reasoning as already open.
+3. **Start marker present but not at the end** (e.g. Apriel's `<|begin_assistant|>` followed by boilerplate): the marker is a template artifact; the start literal is cleared so reasoning uses delimiter-style (end-only). For templates that ignore `add_generation_prompt` (empty diff), the rendered `data.prompt` is used as fallback — but only for non-TOOLS_ONLY modes, since in TOOLS_ONLY the start tag is model-generated and may appear in prior conversation turns.
+
 **`content_mode`**: How the template wraps assistant content.

 | Value                    | Description                                                    |
@ -261,16 +270,16 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark

 - Searches `diff.right` (output with reasoning) for the reasoning content needle
 - Uses PEG parsers to find surrounding markers:
-  - If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
-  - If both found but post marker only in the full output B → `FORCED_CLOSED`
-  - If only post marker found → `DELIMITER`
+  - If both pre/post markers found in `diff.right` → `TAG_BASED`
+  - If both found but post marker only in the full output B → `TAG_BASED` (template forces markers; handled via prefill)
+  - If only post marker found → `TAG_BASED` (delimiter-style, empty start)
 - Sets `reasoning.start` and `reasoning.end`

 **R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.

- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
- Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
+- Detects template-added reasoning markers: `enable_thinking=true` appends a non-empty marker → sets `reasoning.start`, mode = `TAG_BASED`
+- Handles the reverse case (`enable_thinking=false` appends the marker instead): extracts both start (from the preceding segment) and end markers; mode = `TAG_BASED`
+- The reasoning prefill (markers added by the template) is later extracted in `common_chat_templates_apply_jinja` and prepended to model output before parsing

 **R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.

@ -343,7 +352,7 @@ Classification logic:

 A workaround array in `common/chat-diff-analyzer.cpp` applies post-hoc patches after analysis. Each workaround is a lambda that inspects the template source and overrides analysis results. Current workarounds:

-1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')`: sets `reasoning.mode = FORCED_OPEN` with `<think>`/`</think>` markers if no reasoning was detected
+1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')` but not `<SPECIAL_12>`: sets `reasoning.mode = TAG_BASED` with `<think>`/`</think>` markers if no reasoning was detected
 2. **Granite 3.3** — source contains specific "Write your thoughts" text: forces `TAG_BASED` reasoning with `<think>`/`</think>` and `WRAPPED_WITH_REASONING` content with `<response>`/`</response>`
 3. **Cohere Command R+** — source contains `<|CHATBOT_TOKEN|>`: sets `ALWAYS_WRAPPED` content mode if no content start is already set
 4. **Functionary 3.1** — source contains `set has_code_interpreter`: forces `PLAIN` content, specific `per_call_start/end`, clears preserved tokens to only keep Functionary-specific markers
@ -355,12 +364,13 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i

 #### Reasoning Parser (`analyze_reasoning::build_parser`)

-| Mode                              | Parser                                                              |
-|-----------------------------------|---------------------------------------------------------------------|
-| Not extracting reasoning          | `eps()`                                                             |
-| `FORCED_OPEN` or `FORCED_CLOSED`  | `reasoning(until(end)) + end` — opening tag was in the prompt       |
-| `TAG_BASED` or `TOOLS_ONLY`       | `optional(start + reasoning(until(end)) + end)`                     |
-| `DELIMITER`                       | `optional(reasoning(until(end)) + end)` — no start marker           |
+| Mode                                          | Parser                                                                    |
+|-----------------------------------------------|---------------------------------------------------------------------------|
+| Not extracting reasoning                      | `eps()`                                                                   |
+| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end + space())`                 |
+| `TAG_BASED` or `TOOLS_ONLY` (empty start)     | `optional(reasoning(until(end)) + end + space())` — delimiter-style       |
+
+Note: The start marker may be empty either because the analyzer detected delimiter-style reasoning, or because `generate_parser()` cleared a start marker that turned out to be a template artifact (see Generation Prompt & Reasoning Prefill above). Whitespace-only reasoning content (e.g. from a `<think></think>` prefill) is discarded by the mapper.

 #### Content Parser (`analyze_content::build_parser`)

@ -410,9 +420,7 @@ All three tool parsers return:
 reasoning + optional(content(until(trigger_marker))) + tool_calls + end()
 ```

-### Python Dict Format
-
-When `format.uses_python_dicts` is true (detected when single-quoted strings appear in JSON argument context), `build_parser()` pre-registers a `json-string` rule that accepts both single-quoted and double-quoted strings. This is done before any `p.json()` call so all JSON parsing inherits the flexible rule.
+Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepends a literal for any boilerplate prefix of the generation prompt (the portion before the reasoning start marker).

 ## Mapper

@ -421,22 +429,22 @@ When `format.uses_python_dicts` is true (detected when single-quoted strings app
 - **Buffered arguments**: Before `tool_name` is known, argument text goes to `args_buffer`; once the name is set, the buffer is flushed to `current_tool->arguments`
 - **`args_target()`**: Returns a reference to whichever destination is currently active (buffer or tool args), eliminating branching
 - **`closing_quote_pending`**: Tracks whether a closing `"` needs to be appended when a string argument value is finalized (for schema-declared string types in tagged format)
- **Quote normalization**: Python-style quotes (`'key': 'value'`) are converted to JSON (`"key": "value"`)
+- **Whitespace-only reasoning**: Reasoning content that consists entirely of whitespace (e.g. from a `<think></think>` prefill) is cleared so the message shows no reasoning
 - **Brace auto-closing**: At tool close, unclosed `{` braces are closed automatically

 ## Files

-| File                                      | Purpose                                                              |
-|-------------------------------------------|----------------------------------------------------------------------|
-| `common/chat-auto-parser.h`               | All analysis structs, enums, `autoparser`, `peg_generator`, `templates_params` |
-| `common/chat-auto-parser-generator.cpp`   | Parser generator: `generate_parser()` and `build_parser()` methods   |
-| `common/chat-diff-analyzer.cpp`           | Differential analysis implementation and workarounds                 |
-| `common/chat-auto-parser-helpers.h/cpp`   | `calculate_diff_split()`, `segmentize_markers()`,                    |
-|                                           | `compare_variants()`, string helpers                                 |
-| `common/chat-peg-parser.h/cpp`            | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers     |
-| `common/chat.cpp`                         | Entry point: `common_chat_templates_apply_jinja()`                   |
-| `tools/parser/debug-template-parser.cpp`  | Debug tool for template analysis                                     |
-| `tools/parser/template-analysis.cpp`      | Template analysis tool                                               |
+| File                                      | Purpose                                                                         |
+|-------------------------------------------|---------------------------------------------------------------------------------|
+| `common/chat-auto-parser.h`               | All analysis structs, enums, `autoparser`, `peg_generator`, `generation_params` |
+| `common/chat-auto-parser-generator.cpp`   | Parser generator: `generate_parser()` and `build_parser()` methods              |
+| `common/chat-diff-analyzer.cpp`           | Differential analysis implementation and workarounds                            |
+| `common/chat-auto-parser-helpers.h/cpp`   | `calculate_diff_split()`, `segmentize_markers()`, `compare_variants()`,         |
+|                                           | `wrap_for_generation_prompt()`, string helpers                                  |
+| `common/chat-peg-parser.h/cpp`            | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers                |
+| `common/chat.cpp`                         | Entry point: `common_chat_templates_apply_jinja()`                              |
+| `tools/parser/debug-template-parser.cpp`  | Debug tool for template analysis                                                |
+| `tools/parser/template-analysis.cpp`      | Template analysis tool                                                          |

 ## Testing & Debugging

@ -516,10 +524,10 @@ To support a new template format:

 ## Edge Cases and Quirks

-1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
+1. **Generation Prompt & Reasoning Prefill**: The generation prompt is extracted by diffing `add_generation_prompt=false` vs `true` in `common_chat_templates_apply_jinja`, so it contains exactly what the template appends — avoiding false positives from prior conversation turns.
 2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
-3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
-4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
-5. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
-6. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
-7. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
+3. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
+4. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
+5. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
+6. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
+7. **Undetected Tool Format**: If `analyze_tools` concludes tool calling is supported but cannot determine the format, `build_parser()` logs an error and returns `eps()` (graceful degradation) rather than aborting.
--- a/models/templates/Apriel-1.6-15b-Thinker-fixed.jinja
+++ b/models/templates/Apriel-1.6-15b-Thinker-fixed.jinja
@ -7,7 +7,6 @@
 {%- set available_tool_string = '' -%}
 {%- set add_tool_id = true -%}
 {%- set add_thoughts = true -%}            {# whether to include <thinking> reasoning blocks #}
-{%- set add_generation_prompt = true -%}      {# whether to emit reasoning starter before assistant response #}
 {# Optional token placeholders (safe defaults) #}
 {%- set bos_token = bos_token or '' -%}
 {%- set eos_token = eos_token or '' -%}
--- a/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja
+++ b/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja
@ -15,10 +15,10 @@
    {%- set ns.is_tool = false -%}
    {%- for tool in message['tool_calls']-%}
      {%- if not ns.is_first -%}
-        {{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+        {{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] | tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
        {%- set ns.is_first = true -%}
      {%- else -%}
-        {{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+        {{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] | tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
      {%- endif -%}
    {%- endfor -%}
    {{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
--- a/models/templates/deepseek-ai-DeepSeek-V3.1.jinja
+++ b/models/templates/deepseek-ai-DeepSeek-V3.1.jinja
@ -28,25 +28,25 @@
    {%- set ns.is_last_user = true -%}{{'<｜User｜>' + message['content']}}
  {%- endif -%}
  {%- if message['role'] == 'assistant' and message['tool_calls'] -%}
-    {%- if ns.is_last_user -%}{{'<｜Assistant｜></think>'}}
+    {%- if ns.is_last_user -%}{{'<｜Assistant｜><think></think>'}}
    {%- endif -%}
    {%- set ns.is_last_user = false -%}
    {%- set ns.is_first = false -%}
    {%- set ns.is_tool = false -%}
    {%- for tool in message['tool_calls'] -%}
      {%- if not ns.is_first -%}
-        {%- if not message['content'] -%}{{'<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>'+ tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments'] + '<｜tool▁call▁end｜>'}}
-          {%- else -%}{{message['content'] + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments'] + '<｜tool▁call▁end｜>'}}
+        {%- if not message['content'] -%}{{'<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>'+ tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments'] | tojson + '<｜tool▁call▁end｜>'}}
+          {%- else -%}{{message['content'] + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments'] | tojson + '<｜tool▁call▁end｜>'}}
        {%- endif -%}
        {%- set ns.is_first = true -%}
-        {%- else -%}{{'<｜tool▁call▁begin｜>'+ tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments'] + '<｜tool▁call▁end｜>'}}
+        {%- else -%}{{'<｜tool▁call▁begin｜>'+ tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments'] | tojson + '<｜tool▁call▁end｜>'}}
      {%- endif -%}
    {%- endfor -%}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
  {%- endif -%}
  {%- if message['role'] == 'assistant' and not message['tool_calls'] -%}
    {%- if ns.is_last_user -%}{{'<｜Assistant｜>'}}
      {%- if message['prefix'] is defined and message['prefix'] and thinking -%}{{'<think>'}}
-        {%- else -%}{{'</think>'}}
+        {%- else -%}{{'<think></think>'}}
      {%- endif -%}
    {%- endif -%}
    {%- set ns.is_last_user = false -%}
@ -65,7 +65,7 @@
  {%- endif -%}
 {%- endfor -%}
 {%- if add_generation_prompt and ns.is_last_user and not ns.is_tool -%}{{'<｜Assistant｜>'}}
-  {%- if not thinking -%}{{'</think>'}}
-    {%- else -%}{{'<think>'}}
+  {%- if not thinking -%}{{'<think></think>'}}
+  {%- else -%}{{'<think>'}}
  {%- endif -%}
 {%- endif %}
--- a/models/templates/llama-cpp-deepseek-r1.jinja
+++ b/models/templates/llama-cpp-deepseek-r1.jinja
@ -49,7 +49,7 @@ Example function tool call syntax:
            {%- endif -%}
            {%- set tool_name = tc['function']['name'] -%}
            {%- set tool_args = tc['function']['arguments'] -%}
-            {{- '<｜tool▁call▁begin｜>' + tc['type'] + '<｜tool▁sep｜>' + tool_name + '\n' + '```json' + '\n' + tool_args + '\n' + '```' + '<｜tool▁call▁end｜>' -}}
+            {{- '<｜tool▁call▁begin｜>' + tc['type'] + '<｜tool▁sep｜>' + tool_name + '\n' + '```json' + '\n' + tool_args | tojson + '\n' + '```' + '<｜tool▁call▁end｜>' -}}
        {%- endfor -%}
        {{- '<｜tool▁calls▁end｜><｜end▁of▁sentence｜>' -}}
    {%- endif -%}
--- a/models/templates/meetkai-functionary-medium-v3.1.jinja
+++ b/models/templates/meetkai-functionary-medium-v3.1.jinja
@ -42,9 +42,9 @@
        {%- if 'tool_calls' in message and message['tool_calls'] -%}
            {%- for tool_call in message['tool_calls'] -%}
                {%- if tool_call["function"]["name"] == "python" -%}
-                    {{ '<|python_tag|>' + tool_call['function']['arguments'] }}
+                    {{ '<|python_tag|>' + tool_call['function']['arguments'] | tojson }}
                {%- else -%}
-                    {{ '<function=' + tool_call['function']['name'] + '>' + tool_call['function']['arguments'] + '</function>' }}
+                    {{ '<function=' + tool_call['function']['name'] + '>' + tool_call['function']['arguments'] | tojson + '</function>' }}
                {%- endif -%}
            {%- endfor -%}
            {{ '<|eom_id|>' }}
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@ -1292,11 +1292,11 @@ static void test_nemotron_reasoning_detection(testing & t) {

    // Check reasoning markers
    t.assert_equal("reasoning_start should be '<think>'", "<think>", analysis.reasoning.start);
-    t.assert_equal("reasoning_end should be '</think>\\n'", "</think>\n", analysis.reasoning.end);
+    t.assert_equal("reasoning_end should be '</think>'", "</think>", analysis.reasoning.end);

    // Check reasoning mode detection
-    // Nemotron uses forced closed reasoning with add_generation_prompt
-    t.assert_equal("reasoning should be FORCED_CLOSED", reasoning_mode::FORCED_CLOSED, analysis.reasoning.mode);
+    // Nemotron uses tag-based reasoning; prefill handles the template's forced markers
+    t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode);

    // Make sure reasoning markers don't spill over to content markers
    t.assert_equal("content start should be empty", "", analysis.content.start);
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@ -145,7 +145,7 @@ static void test_example_native(testing & t) {
        common_reasoning_format reasoning_format;
        json                    json_schema;
        bool                    parallel_tool_calls;
-        bool                    thinking_forced_open;
+        std::string             generation_prompt;
        std::string             input;

        // Expect
@ -157,14 +157,8 @@ static void test_example_native(testing & t) {
    auto build_parser = [](const test_case & tc) {
        return build_chat_peg_parser([&](common_chat_peg_builder & p) {
            auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
-            auto reasoning            = p.eps();
-            if (tc.thinking_forced_open) {
-                // If thinking is forced open, expect a closing tag
-                reasoning = p.reasoning(p.until("</think>")) + "</think>" + p.space();
-            } else {
-                // Otherwise, optionally accept thinking wrapped in tags
-                reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
-            }
+            // Always use optional TAG_BASED pattern; generation_prompt is prepended to input
+            auto reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());

            // tool calling parser
            if (tc.tools.is_array() && !tc.tools.empty()) {
@ -190,78 +184,91 @@ static void test_example_native(testing & t) {

    std::vector<test_case> test_cases = std::vector<test_case>{
        {
-         /* .name =                 */ "content with thinking_forced_open = false",
+         /* .name =                 */ "content with reasoning (no generation_prompt)",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ false,
+         /* .generation_prompt =    */ "",
         /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "The user said hello, I must say hello back",
         /* .expect_content =       */ "Hello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = false and no reasoning",
+         /* .name =                 */ "content without reasoning (no generation_prompt)",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ false,
+         /* .generation_prompt =    */ "",
         /* .input =                */ ("Hello"),
         /* .expect_reasoning =     */ "",
         /* .expect_content =       */ "Hello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = false and reasoning_format = none",
+         /* .name =                 */ "content with reasoning_format = none (tags appear in content)",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .generation_prompt =    */ "",
         /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "",
         /* .expect_content =       */ "<think>The user said hello, I must say hello back</think>\nHello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = true",
+         /* .name =                 */ "content with reasoning generation_prompt",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .generation_prompt =    */ "<think>",
         /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "The user said hello, I must say hello back",
         /* .expect_content =       */ "Hello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = true and reasoning_format = none",
+         /* .name =                 */ "content with reasoning generation_prompt and reasoning_format = none",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .generation_prompt =    */ "",
         /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "",
         /* .expect_content =       */ "The user said hello, I must say hello back</think>\nHello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "tools with tool_choice = auto and no parallel_tool_calls",
+         /* .name =                 */ "content with closed reasoning generation_prompt (empty reasoning discarded)",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "<think></think>",
+         /* .input =                */ ("Hello"),
+         /* .expect_reasoning =     */ "",
+         /* .expect_content =       */ "Hello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "tools with reasoning generation_prompt",
         /* .tools =                */ create_tools(),
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .generation_prompt =    */ "<think>",
         /* .input =                */
            ("I must get the weather in New York</think>\n"
             "<tool_call>["
@ -277,13 +284,13 @@ static void test_example_native(testing & t) {
            } },
         },
        {
-         /* .name =                 */ "tools with tool_choice = auto and parallel_tool_calls",
+         /* .name =                 */ "parallel tools with reasoning generation_prompt",
         /* .tools =                */ create_tools(),
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ true,
-         /* .thinking_forced_open = */ true,
+         /* .generation_prompt =    */ "<think>",
         /* .input =                */
            ("I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me "
             "search that for you."
@ -321,7 +328,7 @@ static void test_example_native(testing & t) {
              } },
         },
        {
-         /* .name =                 */ "response_format with thinking_forced_open = true",
+         /* .name =                 */ "response_format with reasoning generation_prompt",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
@ -333,7 +340,7 @@ static void test_example_native(testing & t) {
                  { "due_date", { { "type", "string" } } } } },
              { "required", { "invoice_number", "amount", "due_date" } } },
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .generation_prompt =    */ "<think>",
         /* .input =                */
            ("I must produce the invoice in the requested format</think>\n"
             R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
@ -361,7 +368,8 @@ static void test_example_native(testing & t) {
                t.log(line);
            }

-            common_peg_parse_context ctx(tc.input);
+            std::string              effective_input = tc.generation_prompt + tc.input;
+            common_peg_parse_context ctx(effective_input);
            auto                     result = parser.parse(ctx);

            t.assert_true("success", result.success());
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@ -822,8 +822,7 @@ struct make_peg_parser {
    }

    common_chat_msg parse(const std::string & msg, bool is_partial) const {
-        common_chat_parser_params parser_params;
-        parser_params.format = params_.format;
+        common_chat_parser_params parser_params(params_);
        parser_params.debug = detailed_debug_;
        return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
    }
@ -996,6 +995,16 @@ static void test_peg_parser(common_chat_templates *                      tmpls,
            grammar_triggered = true;
        }

+        // For non-lazy grammars, prepend reasoning prefill to grammar input, just like
+        // PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional
+        // <think>...</think>), but the model output may start mid-reasoning if the template
+        // already placed the opening tag in the prompt.
+        // For lazy grammars, the grammar only activates from the trigger position, so the
+        // reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
+        if (!parser.params_.generation_prompt.empty() && earliest_trigger_pos == std::string::npos) {
+            constrained = parser.params_.generation_prompt + constrained;
+        }
+
        // Test the constrained portion against the grammar
        if (grammar_triggered && !tc.is_partial) {
            auto result = match_string_detailed(constrained, grammar.get());
@ -1317,12 +1326,15 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
        // NVIDIA Nemotron-3 Nano
        auto tst = peg_tester("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja", detailed_debug);

-        tst.test("Hello, world!\nWhat's up?").enable_thinking(false).expect(message_assist).run();
+        tst.test("Hello, world!\nWhat's up?").
+            enable_thinking(false).
+            reasoning_format(COMMON_REASONING_FORMAT_AUTO).
+            expect(message_assist).run();

        tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
-            .enable_thinking(false)
+            .enable_thinking(true)
            .reasoning_format(COMMON_REASONING_FORMAT_NONE)
-            .expect_content("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
+            .expect_content("<think>I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
            .run();

        tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
@ -1482,7 +1494,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .expect(simple_assist_msg("The answer is 42.", "Let me think about this..."))
            .run();

-        tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
+        tst.test("</think>Hello, world!").reasoning_format(COMMON_REASONING_FORMAT_AUTO).expect(simple_assist_msg("Hello, world!")).run();
    }
    {
        // NousResearch-Hermes-2-Pro and Hermes-3 (tool calling models)
@ -1798,6 +1810,8 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
               "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": "
               "\"XYZCITY\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>")
            .tools({ get_time_tool })
+            .enable_thinking(false)
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
            .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
            .run();
    }
@ -1843,7 +1857,8 @@ static void test_template_output_peg_parsers(bool detailed_debug) {

    {
        auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-V3.1.jinja", detailed_debug);
-        tst.test("CONTENT").expect(simple_assist_msg("CONTENT", "")).run();
+        tst.test("CONTENT").enable_thinking(false).reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK).
+            expect(simple_assist_msg("CONTENT", "")).run();
    }

    // GLM-4.6 tests - format: <tool_call>function_name\n<arg_key>...</arg_key>\n<arg_value>...</arg_value>\n</tool_call>
@ -1906,6 +1921,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
               "<arg_key>arg1</arg_key><arg_value>1</arg_value>"
               "<arg_key>arg2</arg_key><arg_value>2</arg_value>"
               "</tool_call>")
+            .enable_thinking(false)
            .parallel_tool_calls(true)
            .tools({
                special_function_tool, special_function_tool_with_optional_param
@ -2222,10 +2238,11 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
    {
        auto tst = peg_tester("models/templates/MiniMax-M2.jinja", detailed_debug);
        tst.test(
-               "<minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
+               "</think><minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
               "name=\"arg1\">1</parameter>\n</invoke>\n</minimax:tool_call>")
            .tools({ special_function_tool })
            .expect(message_assist_call)
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
            .run();
    }

@ -2288,8 +2305,8 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
    // Functionary v3.2 - recipient-based format: >>>recipient\n{content}
    {
        auto tst = peg_tester("models/templates/meetkai-functionary-medium-v3.2.jinja", detailed_debug);
-        tst.test(">>>all\nHello, world!\nWhat's up?").expect(message_assist).run();
-        tst.test(">>>special_function\n{\"arg1\": 1}")
+        tst.test("all\nHello, world!\nWhat's up?").expect(message_assist).run();
+        tst.test("special_function\n{\"arg1\": 1}")
            .tools({ special_function_tool })
            .expect(message_assist_call)
            .run();
@ -2309,8 +2326,8 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
    // Note: Template uses forced-open mode (prompt ends with <think>), so input shouldn't include opening tag
    {
        auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja", detailed_debug);
-        tst.test("Hello, world!\nWhat's up?")
-            .enable_thinking(true)  // Forced open
+        tst.test("</think>Hello, world!\nWhat's up?")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
            .expect(message_assist)
            .run();
        tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?")
@ -2322,14 +2339,15 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
    // llama-cpp DeepSeek R1 template (always forced-open thinking)
    {
        auto tst = peg_tester("models/templates/llama-cpp-deepseek-r1.jinja", detailed_debug);
-        tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
+        tst.test("</think>Hello, world!\nWhat's up?").expect(message_assist).reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK).run();
        tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?")
            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
            .expect(message_assist_thoughts)
            .run();
        tst.test(
-               "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>special_function\n"
+               "</think><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>special_function\n"
               "```json\n{\"arg1\": 1}```<｜tool▁call▁end｜><｜tool▁calls▁end｜>")
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
            .tools({ special_function_tool })
            .parallel_tool_calls(true)
            .expect(message_assist_call)
@ -2339,7 +2357,9 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
    // Note: Template uses forced-open mode (prompt ends with <think>), so input shouldn't include opening tag
    {
        auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja", detailed_debug);
-        tst.test("Hello, world!\nWhat's up?").enable_thinking(true).expect(message_assist).run();
+        tst.test("</think>Hello, world!\nWhat's up?").enable_thinking(true).
+            reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK).
+            expect(message_assist).run();
        tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?")
            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
            .expect(message_assist_thoughts)
@ -2348,6 +2368,8 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
               "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>special_function\n"
               "```json\n{\"arg1\": 1}```<｜tool▁call▁end｜><｜tool▁calls▁end｜>")
            .tools({ special_function_tool })
+            .enable_thinking(false)
+            .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
            .expect(message_assist_call)
            .run();
    }
@ -2377,12 +2399,12 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
    // Apriel 1.6 Thinker (reasoning-only support)
    {
        auto tst = peg_tester("models/templates/Apriel-1.6-15b-Thinker-fixed.jinja", detailed_debug);
-        tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();

        // Implicit reasoning start (forced open)
        tst.test("I'm\nthinking\n[BEGIN FINAL RESPONSE]\nHello, world!\nWhat's up?")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
-            .expect(message_assist_thoughts)
+            .enable_thinking(true)
+            .expect(simple_assist_msg("Hello, world!\nWhat's up?", "Here are my reasoning steps:\nI'm\nthinking"))
            .run();

        // Reasoning + Tool calls
@ -2390,8 +2412,9 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
               "I'm\nthinking\n[BEGIN FINAL RESPONSE]\n<tool_calls>[{\"name\": \"special_function\", \"arguments\": "
               "{\"arg1\": 1}}]</tool_calls>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
            .tools({ special_function_tool })
-            .expect(message_assist_call_thoughts)
+            .expect(simple_assist_msg("", "Here are my reasoning steps:\nI'm\nthinking", "special_function", "{\"arg1\":1}"))
            .run();
    }

--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@ -105,7 +105,7 @@ struct cli_context {
                    llama_get_model(ctx_server.get_llama_context()));

                task.params.sampling.reasoning_budget_tokens = reasoning_budget;
-                task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
+                task.params.sampling.grammar_prefill = chat_params.generation_prompt;

                if (!chat_params.thinking_start_tag.empty()) {
                    task.params.sampling.reasoning_budget_start =
@ -215,7 +215,7 @@ struct cli_context {
        inputs.parallel_tool_calls   = false;
        inputs.add_generation_prompt = true;
        inputs.reasoning_format      = COMMON_REASONING_FORMAT_DEEPSEEK;
-        inputs.enable_thinking       = common_chat_templates_support_enable_thinking(chat_params.tmpls.get());
+        inputs.enable_thinking       = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false;

        // Apply chat template to the list of messages
        return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
--- a/tools/parser/debug-template-parser.cpp
+++ b/tools/parser/debug-template-parser.cpp
@ -282,7 +282,7 @@ static void render_scenario(const common_chat_template & tmpl,
    LOG_ERR("Messages:\n%s\n", final_messages.dump(2).c_str());

    try {
-        autoparser::templates_params inputs;
+        autoparser::generation_params inputs;
        inputs.messages                         = final_messages;
        inputs.add_generation_prompt            = add_generation_prompt;
        inputs.extra_context["enable_thinking"] = enable_thinking;
@ -395,7 +395,7 @@ int main(int argc, char ** argv) {
            analysis.analyze_template(chat_template);

            // Generate Parser
-            autoparser::templates_params params;
+            autoparser::generation_params params;
            params.messages = json::array({ build_user_message() });
            params.reasoning_format =
                opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
--- a/tools/parser/template-analysis.cpp
+++ b/tools/parser/template-analysis.cpp
@ -400,12 +400,12 @@ static void analyze_template(const std::string & template_path) {
        {
            json user_msg = make_user_msg();

-            autoparser::templates_params params_no_tools;
+            autoparser::generation_params params_no_tools;
            params_no_tools.messages = json::array({ user_msg });
            params_no_tools.add_generation_prompt = false;
            params_no_tools.tools = json::array();

-            autoparser::templates_params params_with_tools = params_no_tools;
+            autoparser::generation_params params_with_tools = params_no_tools;
            params_with_tools.tools = tools;

            std::string output_no_tools = common_chat_template_direct_apply(chat_template, params_no_tools);
@ -419,12 +419,12 @@ static void analyze_template(const std::string & template_path) {
        {
            json user_msg = make_user_msg();

-            autoparser::templates_params params_no_prompt;
+            autoparser::generation_params params_no_prompt;
            params_no_prompt.messages = json::array({ user_msg });
            params_no_prompt.add_generation_prompt = false;
            params_no_prompt.tools = json::array();

-            autoparser::templates_params params_with_prompt = params_no_prompt;
+            autoparser::generation_params params_with_prompt = params_no_prompt;
            params_with_prompt.add_generation_prompt = true;

            std::string output_no_prompt = common_chat_template_direct_apply(chat_template, params_no_prompt);
@ -438,12 +438,12 @@ static void analyze_template(const std::string & template_path) {
        {
            json user_msg = make_user_msg();

-            autoparser::templates_params params_no_reasoning;
+            autoparser::generation_params params_no_reasoning;
            params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning() });
            params_no_reasoning.add_generation_prompt = false;
            params_no_reasoning.enable_thinking = true;

-            autoparser::templates_params params_with_reasoning = params_no_reasoning;
+            autoparser::generation_params params_with_reasoning = params_no_reasoning;
            params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning() });

            std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
@ -458,12 +458,12 @@ static void analyze_template(const std::string & template_path) {
            json user_msg = make_user_msg();
            json user_msg2 = make_user_msg2();

-            autoparser::templates_params params_no_reasoning;
+            autoparser::generation_params params_no_reasoning;
            params_no_reasoning.messages = json::array({ user_msg, make_assistant_no_reasoning(), user_msg2 });
            params_no_reasoning.add_generation_prompt = false;
            params_no_reasoning.enable_thinking = true;

-            autoparser::templates_params params_with_reasoning = params_no_reasoning;
+            autoparser::generation_params params_with_reasoning = params_no_reasoning;
            params_with_reasoning.messages = json::array({ user_msg, make_assistant_with_reasoning(), user_msg2 });

            std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
@ -477,12 +477,12 @@ static void analyze_template(const std::string & template_path) {
        {
            json user_msg = make_user_msg();

-            autoparser::templates_params params_no_tool;
+            autoparser::generation_params params_no_tool;
            params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool() });
            params_no_tool.add_generation_prompt = false;
            params_no_tool.tools = tools;

-            autoparser::templates_params params_with_tool = params_no_tool;
+            autoparser::generation_params params_with_tool = params_no_tool;
            params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool() });

            std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
@ -497,12 +497,12 @@ static void analyze_template(const std::string & template_path) {
            json user_msg = make_user_msg();
            json user_msg2 = make_user_msg2_continue();

-            autoparser::templates_params params_no_tool;
+            autoparser::generation_params params_no_tool;
            params_no_tool.messages = json::array({ user_msg, make_assistant_no_tool(), user_msg2 });
            params_no_tool.add_generation_prompt = false;
            params_no_tool.tools = tools;

-            autoparser::templates_params params_with_tool = params_no_tool;
+            autoparser::generation_params params_with_tool = params_no_tool;
            params_with_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });

            std::string output_no_tool = common_chat_template_direct_apply(chat_template, params_no_tool);
@ -516,12 +516,12 @@ static void analyze_template(const std::string & template_path) {
        {
            json user_msg = make_user_msg();

-            autoparser::templates_params params_one_tool;
+            autoparser::generation_params params_one_tool;
            params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool() });
            params_one_tool.add_generation_prompt = false;
            params_one_tool.tools = tools;

-            autoparser::templates_params params_two_tools = params_one_tool;
+            autoparser::generation_params params_two_tools = params_one_tool;
            params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools() });

            std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
@ -536,12 +536,12 @@ static void analyze_template(const std::string & template_path) {
            json user_msg = make_user_msg();
            json user_msg2 = make_user_msg2_continue();

-            autoparser::templates_params params_one_tool;
+            autoparser::generation_params params_one_tool;
            params_one_tool.messages = json::array({ user_msg, make_assistant_one_tool(), user_msg2 });
            params_one_tool.add_generation_prompt = false;
            params_one_tool.tools = tools;

-            autoparser::templates_params params_two_tools = params_one_tool;
+            autoparser::generation_params params_two_tools = params_one_tool;
            params_two_tools.messages = json::array({ user_msg, make_assistant_two_tools(), user_msg2 });

            std::string output_one_tool = common_chat_template_direct_apply(chat_template, params_one_tool);
@ -555,13 +555,13 @@ static void analyze_template(const std::string & template_path) {
        {
            json user_msg = make_user_msg();

-            autoparser::templates_params params_no_reasoning;
+            autoparser::generation_params params_no_reasoning;
            params_no_reasoning.messages = json::array({ user_msg, make_assistant_one_tool() });
            params_no_reasoning.add_generation_prompt = false;
            params_no_reasoning.tools = tools;
            params_no_reasoning.enable_thinking = true;

-            autoparser::templates_params params_with_reasoning = params_no_reasoning;
+            autoparser::generation_params params_with_reasoning = params_no_reasoning;
            params_with_reasoning.messages = json::array({ user_msg, make_assistant_one_tool_with_reasoning() });

            std::string output_no_reasoning = common_chat_template_direct_apply(chat_template, params_no_reasoning);
--- a/tools/server/README.md
+++ b/tools/server/README.md
@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
      "chat_format": "GPT-OSS",
      "reasoning_format": "none",
      "reasoning_in_content": false,
-      "thinking_forced_open": false,
+      "generation_prompt": "",
      "samplers": [
        "penalties",
        "dry",
@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
      "chat_format": "GPT-OSS",
      "reasoning_format": "none",
      "reasoning_in_content": false,
-      "thinking_forced_open": false,
+      "generation_prompt": "",
      "samplers": [
        "penalties",
        "dry",
@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":

 `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.

-`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
+`generation_prompt`: The generation prompt prefilled by the chat template. It is prepended to the model output before parsing.

 `parse_tool_calls`: Whether to parse the generated tool call.

--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@ -1080,20 +1080,21 @@ json oaicompat_chat_params_parse(
        }
    }

-    llama_params["chat_format"]      = static_cast<int>(chat_params.format);
-    llama_params["prompt"]           = chat_params.prompt;
+    llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["prompt"]      = chat_params.prompt;
    if (!chat_params.grammar.empty()) {
        llama_params["grammar"] = chat_params.grammar;
    }
    llama_params["grammar_lazy"]     = chat_params.grammar_lazy;
-    auto grammar_triggers = json::array();
+    llama_params["grammar_external"] = body.contains("grammar");
+    auto grammar_triggers        = json::array();
    for (const auto & trigger : chat_params.grammar_triggers) {
        server_grammar_trigger ct(trigger);
        grammar_triggers.push_back(ct.to_json());
    }
-    llama_params["grammar_triggers"] = grammar_triggers;
-    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-    llama_params["thinking_forced_open"]     = chat_params.thinking_forced_open;
+    llama_params["grammar_triggers"]  = grammar_triggers;
+    llama_params["preserved_tokens"]  = chat_params.preserved_tokens;
+    llama_params["generation_prompt"] = chat_params.generation_prompt;
    for (const auto & stop : chat_params.additional_stops) {
        llama_params["stop"].push_back(stop);
    }
@ -1113,7 +1114,6 @@ json oaicompat_chat_params_parse(
            llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
            llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
            llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
-            llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
        }
    }

--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -15,6 +15,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <cinttypes>
+#include <exception>
 #include <memory>
 #include <filesystem>

@ -1151,10 +1152,19 @@ private:

        // initialize samplers
        if (task.need_sampling()) {
-            slot.smpl.reset(common_sampler_init(model, task.params.sampling));
+            try {
+                slot.smpl.reset(common_sampler_init(model, task.params.sampling));
+            } catch (const std::exception & e) { // catch by const reference: the handler only reads e.what()
+                LOG_ERR("%s: error initializing samplers. Grammar was:\n%s\n\nGrammar prefill:\n'%s'\n", __func__,
+                    task.params.sampling.grammar.c_str(), task.params.sampling.grammar_prefill.c_str());
+                std::string err_msg = std::string("Failed to initialize samplers: ") + e.what();
+                send_error(task, err_msg, ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }

            if (slot.smpl == nullptr) {
-                // for now, the only error that may happen here is invalid grammar
+                LOG_ERR("%s: error in parsing grammar. Grammar was:\n%s\n\nGrammar prefill:\n'%s'\n", __func__,
+                    task.params.sampling.grammar.c_str(), task.params.sampling.grammar_prefill.c_str());
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                return false;
            }
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@ -38,53 +38,53 @@ json task_params::to_json(bool only_metrics) const {
    }

    if (only_metrics) {
-        return json {
-            {"seed",                      sampling.seed},
-            {"temperature",               sampling.temp},
-            {"dynatemp_range",            sampling.dynatemp_range},
-            {"dynatemp_exponent",         sampling.dynatemp_exponent},
-            {"top_k",                     sampling.top_k},
-            {"top_p",                     sampling.top_p},
-            {"min_p",                     sampling.min_p},
-            {"top_n_sigma",               sampling.top_n_sigma},
-            {"xtc_probability",           sampling.xtc_probability},
-            {"xtc_threshold",             sampling.xtc_threshold},
-            {"typical_p",                 sampling.typ_p},
-            {"repeat_last_n",             sampling.penalty_last_n},
-            {"repeat_penalty",            sampling.penalty_repeat},
-            {"presence_penalty",          sampling.penalty_present},
-            {"frequency_penalty",         sampling.penalty_freq},
-            {"dry_multiplier",            sampling.dry_multiplier},
-            {"dry_base",                  sampling.dry_base},
-            {"dry_allowed_length",        sampling.dry_allowed_length},
-            {"dry_penalty_last_n",        sampling.dry_penalty_last_n},
-            {"mirostat",                  sampling.mirostat},
-            {"mirostat_tau",              sampling.mirostat_tau},
-            {"mirostat_eta",              sampling.mirostat_eta},
-            {"max_tokens",                n_predict},
-            {"n_predict",                 n_predict}, // TODO: deduplicate?
-            {"n_keep",                    n_keep},
-            {"n_discard",                 n_discard},
-            {"ignore_eos",                sampling.ignore_eos},
-            {"stream",                    stream},
-            {"n_probs",                   sampling.n_probs},
-            {"min_keep",                  sampling.min_keep},
-            {"chat_format",               common_chat_format_name(chat_parser_params.format)},
-            {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
-            {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
-            {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
-            {"samplers",                  samplers},
-            {"speculative.n_max",         speculative.n_max},
-            {"speculative.n_min",         speculative.n_min},
-            {"speculative.p_min",         speculative.p_min},
-            {"speculative.type",          common_speculative_type_to_str(speculative.type)},
-            {"speculative.ngram_size_n",  speculative.ngram_size_n},
-            {"speculative.ngram_size_m",  speculative.ngram_size_m},
-            {"speculative.ngram_m_hits",  speculative.ngram_min_hits},
-            {"timings_per_token",         timings_per_token},
-            {"post_sampling_probs",       post_sampling_probs},
-            {"backend_sampling",          sampling.backend_sampling},
-            {"lora",                      lora},
+        return json{
+            { "seed",                     sampling.seed                                                     },
+            { "temperature",              sampling.temp                                                     },
+            { "dynatemp_range",           sampling.dynatemp_range                                           },
+            { "dynatemp_exponent",        sampling.dynatemp_exponent                                        },
+            { "top_k",                    sampling.top_k                                                    },
+            { "top_p",                    sampling.top_p                                                    },
+            { "min_p",                    sampling.min_p                                                    },
+            { "top_n_sigma",              sampling.top_n_sigma                                              },
+            { "xtc_probability",          sampling.xtc_probability                                          },
+            { "xtc_threshold",            sampling.xtc_threshold                                            },
+            { "typical_p",                sampling.typ_p                                                    },
+            { "repeat_last_n",            sampling.penalty_last_n                                           },
+            { "repeat_penalty",           sampling.penalty_repeat                                           },
+            { "presence_penalty",         sampling.penalty_present                                          },
+            { "frequency_penalty",        sampling.penalty_freq                                             },
+            { "dry_multiplier",           sampling.dry_multiplier                                           },
+            { "dry_base",                 sampling.dry_base                                                 },
+            { "dry_allowed_length",       sampling.dry_allowed_length                                       },
+            { "dry_penalty_last_n",       sampling.dry_penalty_last_n                                       },
+            { "mirostat",                 sampling.mirostat                                                 },
+            { "mirostat_tau",             sampling.mirostat_tau                                             },
+            { "mirostat_eta",             sampling.mirostat_eta                                             },
+            { "max_tokens",               n_predict                                                         },
+            { "n_predict",                n_predict                                                         }, // TODO: deduplicate?
+            { "n_keep",                   n_keep                                                            },
+            { "n_discard",                n_discard                                                         },
+            { "ignore_eos",               sampling.ignore_eos                                               },
+            { "stream",                   stream                                                            },
+            { "n_probs",                  sampling.n_probs                                                  },
+            { "min_keep",                 sampling.min_keep                                                 },
+            { "chat_format",              common_chat_format_name(chat_parser_params.format)                },
+            { "reasoning_format",         common_reasoning_format_name(chat_parser_params.reasoning_format) },
+            { "reasoning_in_content",     chat_parser_params.reasoning_in_content                           },
+            { "generation_prompt",        chat_parser_params.generation_prompt                              },
+            { "samplers",                 samplers                                                          },
+            { "speculative.n_max",        speculative.n_max                                                 },
+            { "speculative.n_min",        speculative.n_min                                                 },
+            { "speculative.p_min",        speculative.p_min                                                 },
+            { "speculative.type",         common_speculative_type_to_str(speculative.type)                  },
+            { "speculative.ngram_size_n", speculative.ngram_size_n                                          },
+            { "speculative.ngram_size_m", speculative.ngram_size_m                                          },
+            { "speculative.ngram_m_hits", speculative.ngram_min_hits                                        },
+            { "timings_per_token",        timings_per_token                                                 },
+            { "post_sampling_probs",      post_sampling_probs                                               },
+            { "backend_sampling",         sampling.backend_sampling                                         },
+            { "lora",                     lora                                                              },
        };
    }

@ -135,7 +135,7 @@ json task_params::to_json(bool only_metrics) const {
        {"chat_format",               common_chat_format_name(chat_parser_params.format)},
        {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
        {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
-        {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
+        {"generation_prompt",         chat_parser_params.generation_prompt},
        {"samplers",                  samplers},
        {"speculative.n_max",         speculative.n_max},
        {"speculative.n_min",         speculative.n_min},
@ -382,7 +382,8 @@ task_params server_task::params_from_json_cmpl(
            throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
        }
    } else {
-        params.sampling.grammar      = json_value(data, "grammar", defaults.sampling.grammar);
+        params.sampling.grammar          = json_value(data, "grammar", defaults.sampling.grammar);
+        params.sampling.grammar_external = json_value(data, "grammar_external", params.sampling.grammar_external);
        SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
        params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
        SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
@ -402,7 +403,8 @@ task_params server_task::params_from_json_cmpl(
        }
        params.chat_parser_params.reasoning_format = reasoning_format;
        params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
-        params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+        params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
+        params.sampling.grammar_prefill = params.chat_parser_params.generation_prompt;
        params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
        if (data.contains("chat_parser")) {
            params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
@ -469,10 +471,7 @@ task_params server_task::params_from_json_cmpl(
            const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
            const auto end_tag   = json_value(data, "reasoning_budget_end_tag", std::string());
            const auto message   = json_value(data, "reasoning_budget_message", std::string());
-            const bool activate_imm   = json_value(data, "reasoning_budget_activate_immediately", false);
-
            params.sampling.reasoning_budget_tokens = budget;
-            params.sampling.reasoning_budget_activate_immediately = activate_imm;

            if (!start_tag.empty()) {
                params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
@ -482,8 +481,8 @@ task_params server_task::params_from_json_cmpl(
                params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
            }

-            SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
-                budget, activate_imm ? "true" : "false",
+            SRV_DBG("reasoning budget: tokens=%d, grammar_prefill='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
+                budget, params.sampling.grammar_prefill.c_str(),
                params.sampling.reasoning_budget_start.size(),
                params.sampling.reasoning_budget_end.size(),
                params.sampling.reasoning_budget_forced.size());
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@ -210,6 +210,7 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
 def test_completion_with_json_schema(jinja: bool, json_schema: dict, n_predicted: int, re_content: str):
    global server
    server.jinja = jinja
+    server.debug = True
    server.start()
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": n_predicted,
--- a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
+++ b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
 				chat_format: '',
 				reasoning_format: '',
 				reasoning_in_content: false,
-				thinking_forced_open: false,
+				generation_prompt: '',
 				'speculative.n_max': 0,
 				'speculative.n_min': 0,
 				'speculative.p_min': 0.0,
@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
 				chat_format: '',
 				reasoning_format: '',
 				reasoning_in_content: false,
-				thinking_forced_open: false,
+				generation_prompt: '',
 				'speculative.n_max': 0,
 				'speculative.n_min': 0,
 				'speculative.p_min': 0.0,
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
 			chat_format: string;
 			reasoning_format: string;
 			reasoning_in_content: boolean;
-			thinking_forced_open: boolean;
+			generation_prompt: string;
 			samplers: string[];
 			backend_sampling: boolean;
 			'speculative.n_max': number;
@ -332,7 +332,7 @@ export interface ApiSlotData {
 		chat_format: string;
 		reasoning_format: string;
 		reasoning_in_content: boolean;
-		thinking_forced_open: boolean;
+		generation_prompt: string;
 		samplers: string[];
 		backend_sampling: boolean;
 		'speculative.n_max': number;