Gunther Schulz 2026-02-16 15:55:30 +02:00 committed by GitHub
commit 1f84fb9112
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 82 additions and 20 deletions

View File

@@ -498,9 +498,42 @@ inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct
     }
     auto val_start = builder.pos();
-    // Test if arg_val is a partial JSON
+    // vLLM-style: only try to parse the value when there is content; empty/whitespace means the value is still incomplete (avoids the "parse empty input" log).
+    // When the remainder does not look like the start of a JSON value, skip try_consume_json and fall through to the plain-text path (e.g. "explore").
+    bool looks_like_json = true;
+    {
+        const auto & inp = builder.input();
+        const size_t rem_len = (val_start < inp.size()) ? (inp.size() - val_start) : 0;
+        std::string_view rest_sv(inp.data() + val_start, rem_len);
+        if (rest_sv.empty() || all_space(rest_sv)) {
+            gen_partial_args([&](auto & rest, auto & needle) { arguments[key] = (form.trim_raw_argval ? string_strip(rest) : rest) + needle; });
+            throw common_chat_msg_partial_exception(
+                "Expected " + gbnf_format_literal(form.val_end) +
+                " after " + gbnf_format_literal(form.key_val_sep) +
+                (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
+            );
+        }
+        // Only call try_consume_json when the remainder looks like the start of a JSON value (avoids a SAX error at position 0 and the resulting "empty input" log).
+        // Otherwise fall through to the plain-text path (e.g. subagent_type=explore).
+        size_t pos = 0;
+        while (pos < rest_sv.size() && std::isspace(static_cast<unsigned char>(rest_sv[pos]))) { ++pos; }
+        if (pos >= rest_sv.size()) {
+            looks_like_json = false;
+        } else {
+            std::string_view rest_trim = rest_sv.substr(pos);
+            char c = rest_trim[0];
+            looks_like_json = (c == '"' || c == '{' || c == '[' || (c >= '0' && c <= '9') || c == '-');
+            if (!looks_like_json) {
+                if (c == 't') looks_like_json = (rest_trim.size() <= 4 && std::string_view("true").substr(0, rest_trim.size()) == rest_trim);
+                else if (c == 'f') looks_like_json = (rest_trim.size() <= 5 && std::string_view("false").substr(0, rest_trim.size()) == rest_trim);
+                else if (c == 'n') looks_like_json = (rest_trim.size() <= 4 && std::string_view("null").substr(0, rest_trim.size()) == rest_trim);
+            }
+        }
+    }
+    // Test if arg_val is a partial JSON (only when the remainder looks like JSON; otherwise take the plain-text path below)
     std::optional<common_json> value_json = std::nullopt;
-    if (!form.raw_argval || !*form.raw_argval) {
+    if ((!form.raw_argval || !*form.raw_argval) && looks_like_json) {
         try { value_json = builder.try_consume_json(); }
         catch (const std::runtime_error&) { builder.move_to(val_start); }
         // TODO: Delete this when json_partial adds top-level support for null/true/false
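
For reference, the added check boils down to a small predicate over the remaining input: a remainder counts as a possible JSON start if, after leading whitespace, it begins with ", {, [, -, a digit, or a prefix of true/false/null. A standalone sketch of the same logic (the helper name is hypothetical, not part of this commit):

#include <cctype>
#include <string_view>

// Hypothetical helper mirroring the check above: true when `s`, after
// leading whitespace, could begin a JSON value, including a partial
// prefix of the literals true/false/null (e.g. "tr" mid-stream).
static bool looks_like_json_start(std::string_view s) {
    size_t i = 0;
    while (i < s.size() && std::isspace(static_cast<unsigned char>(s[i]))) {
        ++i;
    }
    if (i >= s.size()) {
        return false; // empty or whitespace-only: incomplete, not JSON
    }
    s.remove_prefix(i);
    const char c = s[0];
    if (c == '"' || c == '{' || c == '[' || c == '-' || (c >= '0' && c <= '9')) {
        return true;
    }
    auto is_prefix_of = [&](std::string_view lit) {
        return s.size() <= lit.size() && lit.substr(0, s.size()) == s;
    };
    return (c == 't' && is_prefix_of("true"))  ||
           (c == 'f' && is_prefix_of("false")) ||
           (c == 'n' && is_prefix_of("null"));
}

// looks_like_json_start("\"Par")   -> true  (partial JSON string)
// looks_like_json_start("fals")    -> true  (partial literal)
// looks_like_json_start("explore") -> false (plain-text path)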

View File

@@ -2169,7 +2169,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
 static common_chat_params common_chat_params_init_glm_4_5(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.grammar_lazy = inputs.tools.is_array() && !inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+    // vLLM-style for AUTO: no grammar/trigger during generation; tool calls are parsed from the decoded text (common_chat_parse_glm_4_5).
+    // A grammar is only used when tool_choice == REQUIRED (to force a tool call from the first token).
+    data.grammar_lazy = false;
     std::string prompt = apply(tmpl, inputs);
@@ -2228,18 +2230,22 @@ static common_chat_params common_chat_params_init_glm_4_5(const common_chat_temp
         "<|observation|>"
     });
-    // build grammar for tool call
-    static const xml_tool_call_format form {
-        /* form.scope_start = */ "",
-        /* form.tool_start = */ "\n<tool_call>",
-        /* form.tool_sep = */ "\n",
-        /* form.key_start = */ "<arg_key>",
-        /* form.key_val_sep = */ "</arg_key>\n<arg_value>",
-        /* form.val_end = */ "</arg_value>\n",
-        /* form.tool_end = */ "</tool_call>\n",
-        /* form.scope_end = */ "",
-    };
-    build_grammar_xml_tool_call(data, inputs.tools, form);
+    // Build a grammar only for tool_choice == REQUIRED (to force a tool call from the first token).
+    // For AUTO, generate freely and parse tool calls from the decoded text (common_chat_parse_glm_4_5).
+    const bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    if (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
+        static const xml_tool_call_format form {
+            /* form.scope_start = */ "",
+            /* form.tool_start = */ "\n<tool_call>",
+            /* form.tool_sep = */ "\n",
+            /* form.key_start = */ "<arg_key>",
+            /* form.key_val_sep = */ "</arg_key>\n<arg_value>",
+            /* form.val_end = */ "</arg_value>\n",
+            /* form.tool_end = */ "</tool_call>\n",
+            /* form.scope_end = */ "",
+        };
+        build_grammar_xml_tool_call(data, inputs.tools, form);
+    }
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GLM_4_5;
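
With those format fields, a complete GLM 4.5 tool call in the decoded text has the following shape (illustrative; the tool and argument names are made up):

<tool_call>get_weather
<arg_key>location</arg_key>
<arg_value>Berlin</arg_value>
</tool_call>

In AUTO mode this text is generated unconstrained and recovered by common_chat_parse_glm_4_5; in REQUIRED mode the grammar built above forces the model to emit this structure from the first token.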

View File

@@ -120,6 +120,11 @@ bool common_json_parse(
     auto temptative_end = it + err_loc.position;
     // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
+    // Avoid parsing and logging "empty input" when error is at position 0 (e.g. streaming partial/invalid JSON)
+    if (temptative_end == it) {
+        return false;
+    }
     auto input = std::string(it, temptative_end);
     try {
         out.json = json::parse(input);
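
The position-0 guard matters because, for a non-JSON remainder such as explore, the SAX parser reports its error at offset 0; std::string(it, temptative_end) is then empty, and json::parse throws (and logs) "unexpected end of input". A minimal reproduction of that failure mode, assuming nlohmann::json:

#include <cstdio>
#include <nlohmann/json.hpp>

int main() {
    try {
        // what the old code effectively attempted when the error was at position 0
        auto j = nlohmann::json::parse("");
        (void) j;
    } catch (const nlohmann::json::parse_error & e) {
        std::printf("%s\n", e.what()); // "... unexpected end of input ..."
    }
    return 0;
}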

View File

@@ -349,9 +349,7 @@ static void test_templates(const struct common_chat_templates * tmpls, const std
             assert_msg_equals(test_message, msg, ignore_whitespace_differences);
         }
-        if (!test_message.tool_calls.empty()) {
-            GGML_ASSERT(!data.params.grammar.empty());
-        }
+        // GLM 4.5 with tool_choice=AUTO is parse-only (no grammar); other formats still set a grammar when tools are present
         if (!data.params.grammar.empty()) {
             auto grammar = build_grammar(data.params.grammar);
             if (!grammar) {

View File

@@ -2747,7 +2747,17 @@ private:
             slot.i_batch = -1;
-            common_sampler_accept(slot.smpl.get(), id, true);
+            try {
+                common_sampler_accept(slot.smpl.get(), id, true);
+            } catch (const std::runtime_error & e) {
+                // Grammar constraint violation (e.g. "Unexpected empty grammar stack") - return 500 instead of aborting
+                SRV_ERR("slot %d: grammar error, releasing slot: %s\n", slot.id, e.what());
+                send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER);
+                slot.print_timings();
+                metrics.on_prediction(slot);
+                slot.release();
+                continue;
+            }
             // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
             const int64_t t_current = ggml_time_us();
@@ -2791,7 +2801,17 @@ private:
                 const size_t n_draft = slot.drafted.size();
                 // the accepted tokens from the speculation
-                const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
+                std::vector<llama_token> ids;
+                try {
+                    ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
+                } catch (const std::runtime_error & e) {
+                    SRV_ERR("slot %d: grammar error during speculative decoding, releasing slot: %s\n", slot.id, e.what());
+                    send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER);
+                    slot.print_timings();
+                    metrics.on_prediction(slot);
+                    slot.release();
+                    continue;
+                }
                 slot.i_batch_dft.clear();
                 slot.drafted.clear();
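
With both catch blocks in place, a grammar failure mid-generation reaches the client as an HTTP 500 error response instead of aborting the server. Assuming the server's usual error envelope, the response body looks roughly like this (exact fields may differ between versions):

{
  "error": {
    "code": 500,
    "message": "Grammar constraint violation: Unexpected empty grammar stack",
    "type": "server_error"
  }
}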