diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp index d7e54cabfa..e4bbfaddd5 100644 --- a/common/chat-parser-xml-toolcall.cpp +++ b/common/chat-parser-xml-toolcall.cpp @@ -498,7 +498,9 @@ inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct } auto val_start = builder.pos(); - // vLLM-style: only try to parse value when there is content; empty/whitespace or non-JSON start = incomplete (avoids "parse empty input" log) + // vLLM-style: only try to parse value when there is content; empty/whitespace = incomplete (avoids "parse empty input" log). + // When remainder does not look like JSON start, skip try_consume_json and fall through to plain-text path (e.g. "explore"). + bool looks_like_json = true; { const auto & inp = builder.input(); const size_t rem_len = (val_start < inp.size()) ? (inp.size() - val_start) : 0; @@ -511,38 +513,27 @@ inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "") ); } - // Only call try_consume_json when remainder looks like start of a JSON value (avoids SAX error-at-position-0 → "empty input" log) + // Only call try_consume_json when remainder looks like start of a JSON value (avoids SAX error-at-position-0 → "empty input" log). + // Otherwise fall through to plain-text path (e.g. subagent_type=explore). size_t pos = 0; while (pos < rest_sv.size() && std::isspace(static_cast<unsigned char>(rest_sv[pos]))) { ++pos; } if (pos >= rest_sv.size()) { - gen_partial_args([&](auto & rest, auto & needle) { arguments[key] = (form.trim_raw_argval ? string_strip(rest) : rest) + needle; }); - throw common_chat_msg_partial_exception( - "Expected " + gbnf_format_literal(form.val_end) + - " after " + gbnf_format_literal(form.key_val_sep) + - (form.key_val_sep2 ? 
" " + gbnf_format_literal(*form.key_val_sep2) : "") - ); - } - std::string_view rest_trim = rest_sv.substr(pos); - char c = rest_trim[0]; - bool looks_like_json = (c == '"' || c == '{' || c == '[' || (c >= '0' && c <= '9') || c == '-'); - if (!looks_like_json) { - if (c == 't') looks_like_json = (rest_trim.size() <= 4 && std::string_view("true").substr(0, rest_trim.size()) == rest_trim); - else if (c == 'f') looks_like_json = (rest_trim.size() <= 5 && std::string_view("false").substr(0, rest_trim.size()) == rest_trim); - else if (c == 'n') looks_like_json = (rest_trim.size() <= 4 && std::string_view("null").substr(0, rest_trim.size()) == rest_trim); - } - if (!looks_like_json) { - gen_partial_args([&](auto & rest, auto & needle) { arguments[key] = (form.trim_raw_argval ? string_strip(rest) : rest) + needle; }); - throw common_chat_msg_partial_exception( - "Expected " + gbnf_format_literal(form.val_end) + - " after " + gbnf_format_literal(form.key_val_sep) + - (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "") - ); + looks_like_json = false; + } else { + std::string_view rest_trim = rest_sv.substr(pos); + char c = rest_trim[0]; + looks_like_json = (c == '"' || c == '{' || c == '[' || (c >= '0' && c <= '9') || c == '-'); + if (!looks_like_json) { + if (c == 't') looks_like_json = (rest_trim.size() <= 4 && std::string_view("true").substr(0, rest_trim.size()) == rest_trim); + else if (c == 'f') looks_like_json = (rest_trim.size() <= 5 && std::string_view("false").substr(0, rest_trim.size()) == rest_trim); + else if (c == 'n') looks_like_json = (rest_trim.size() <= 4 && std::string_view("null").substr(0, rest_trim.size()) == rest_trim); + } } } - // Test if arg_val is a partial JSON + // Test if arg_val is a partial JSON (only when remainder looks like JSON; else plain-text path below) std::optional<common_json> value_json = std::nullopt; - if (!form.raw_argval || !*form.raw_argval) { + if ((!form.raw_argval || !*form.raw_argval) && looks_like_json) { try { 
value_json = builder.try_consume_json(); } catch (const std::runtime_error&) { builder.move_to(val_start); } // TODO: Delete this when json_partial adds top-level support for null/true/false diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ceafcac179..a555a76442 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2747,7 +2747,17 @@ private: slot.i_batch = -1; - common_sampler_accept(slot.smpl.get(), id, true); + try { + common_sampler_accept(slot.smpl.get(), id, true); + } catch (const std::runtime_error & e) { + // Grammar constraint violation (e.g. "Unexpected empty grammar stack") - return 500 instead of aborting + SRV_ERR("slot %d: grammar error, releasing slot: %s\n", slot.id, e.what()); + send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER); + slot.print_timings(); + metrics.on_prediction(slot); + slot.release(); + continue; + } // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement const int64_t t_current = ggml_time_us(); @@ -2791,7 +2801,17 @@ private: const size_t n_draft = slot.drafted.size(); // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted); + std::vector<llama_token> ids; + try { + ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted); + } catch (const std::runtime_error & e) { + SRV_ERR("slot %d: grammar error during speculative decoding, releasing slot: %s\n", slot.id, e.what()); + send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER); + slot.print_timings(); + metrics.on_prediction(slot); + slot.release(); + continue; + } slot.i_batch_dft.clear(); slot.drafted.clear();