fix: plain-text arg_value fallthrough + grammar error handling in server
- chat-parser-xml-toolcall: when the remainder of an argument value does not look like the start of a JSON value (e.g. 'explore'), skip try_consume_json and fall through to the plain-text path instead of throwing; this fixes the Task tool (subagent_type=explore) never completing.
- server-context: catch std::runtime_error from common_sampler_accept and common_sampler_sample_and_accept_n (e.g. 'Unexpected empty grammar stack'); return 500 and release the slot instead of aborting.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
329a0015a8
commit
4343ae3d65
|
|
@ -498,7 +498,9 @@ inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct
|
|||
}
|
||||
auto val_start = builder.pos();
|
||||
|
||||
// vLLM-style: only try to parse value when there is content; empty/whitespace or non-JSON start = incomplete (avoids "parse empty input" log)
|
||||
// vLLM-style: only try to parse value when there is content; empty/whitespace = incomplete (avoids "parse empty input" log).
|
||||
// When remainder does not look like JSON start, skip try_consume_json and fall through to plain-text path (e.g. "explore").
|
||||
bool looks_like_json = true;
|
||||
{
|
||||
const auto & inp = builder.input();
|
||||
const size_t rem_len = (val_start < inp.size()) ? (inp.size() - val_start) : 0;
|
||||
|
|
@ -511,38 +513,27 @@ inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct
|
|||
(form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
|
||||
);
|
||||
}
|
||||
// Only call try_consume_json when remainder looks like start of a JSON value (avoids SAX error-at-position-0 → "empty input" log)
|
||||
// Only call try_consume_json when remainder looks like start of a JSON value (avoids SAX error-at-position-0 → "empty input" log).
|
||||
// Otherwise fall through to plain-text path (e.g. subagent_type=explore).
|
||||
size_t pos = 0;
|
||||
while (pos < rest_sv.size() && std::isspace(static_cast<unsigned char>(rest_sv[pos]))) { ++pos; }
|
||||
if (pos >= rest_sv.size()) {
|
||||
gen_partial_args([&](auto & rest, auto & needle) { arguments[key] = (form.trim_raw_argval ? string_strip(rest) : rest) + needle; });
|
||||
throw common_chat_msg_partial_exception(
|
||||
"Expected " + gbnf_format_literal(form.val_end) +
|
||||
" after " + gbnf_format_literal(form.key_val_sep) +
|
||||
(form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
|
||||
);
|
||||
}
|
||||
std::string_view rest_trim = rest_sv.substr(pos);
|
||||
char c = rest_trim[0];
|
||||
bool looks_like_json = (c == '"' || c == '{' || c == '[' || (c >= '0' && c <= '9') || c == '-');
|
||||
if (!looks_like_json) {
|
||||
if (c == 't') looks_like_json = (rest_trim.size() <= 4 && std::string_view("true").substr(0, rest_trim.size()) == rest_trim);
|
||||
else if (c == 'f') looks_like_json = (rest_trim.size() <= 5 && std::string_view("false").substr(0, rest_trim.size()) == rest_trim);
|
||||
else if (c == 'n') looks_like_json = (rest_trim.size() <= 4 && std::string_view("null").substr(0, rest_trim.size()) == rest_trim);
|
||||
}
|
||||
if (!looks_like_json) {
|
||||
gen_partial_args([&](auto & rest, auto & needle) { arguments[key] = (form.trim_raw_argval ? string_strip(rest) : rest) + needle; });
|
||||
throw common_chat_msg_partial_exception(
|
||||
"Expected " + gbnf_format_literal(form.val_end) +
|
||||
" after " + gbnf_format_literal(form.key_val_sep) +
|
||||
(form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
|
||||
);
|
||||
looks_like_json = false;
|
||||
} else {
|
||||
std::string_view rest_trim = rest_sv.substr(pos);
|
||||
char c = rest_trim[0];
|
||||
looks_like_json = (c == '"' || c == '{' || c == '[' || (c >= '0' && c <= '9') || c == '-');
|
||||
if (!looks_like_json) {
|
||||
if (c == 't') looks_like_json = (rest_trim.size() <= 4 && std::string_view("true").substr(0, rest_trim.size()) == rest_trim);
|
||||
else if (c == 'f') looks_like_json = (rest_trim.size() <= 5 && std::string_view("false").substr(0, rest_trim.size()) == rest_trim);
|
||||
else if (c == 'n') looks_like_json = (rest_trim.size() <= 4 && std::string_view("null").substr(0, rest_trim.size()) == rest_trim);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test if arg_val is a partial JSON
|
||||
// Test if arg_val is a partial JSON (only when remainder looks like JSON; else plain-text path below)
|
||||
std::optional<common_json> value_json = std::nullopt;
|
||||
if (!form.raw_argval || !*form.raw_argval) {
|
||||
if ((!form.raw_argval || !*form.raw_argval) && looks_like_json) {
|
||||
try { value_json = builder.try_consume_json(); }
|
||||
catch (const std::runtime_error&) { builder.move_to(val_start); }
|
||||
// TODO: Delete this when json_partial adds top-level support for null/true/false
|
||||
|
|
|
|||
|
|
@ -2747,7 +2747,17 @@ private:
|
|||
|
||||
slot.i_batch = -1;
|
||||
|
||||
common_sampler_accept(slot.smpl.get(), id, true);
|
||||
try {
|
||||
common_sampler_accept(slot.smpl.get(), id, true);
|
||||
} catch (const std::runtime_error & e) {
|
||||
// Grammar constraint violation (e.g. "Unexpected empty grammar stack") - return 500 instead of aborting
|
||||
SRV_ERR("slot %d: grammar error, releasing slot: %s\n", slot.id, e.what());
|
||||
send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER);
|
||||
slot.print_timings();
|
||||
metrics.on_prediction(slot);
|
||||
slot.release();
|
||||
continue;
|
||||
}
|
||||
|
||||
// here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
|
@ -2791,7 +2801,17 @@ private:
|
|||
const size_t n_draft = slot.drafted.size();
|
||||
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
|
||||
std::vector<llama_token> ids;
|
||||
try {
|
||||
ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
|
||||
} catch (const std::runtime_error & e) {
|
||||
SRV_ERR("slot %d: grammar error during speculative decoding, releasing slot: %s\n", slot.id, e.what());
|
||||
send_error(slot, std::string("Grammar constraint violation: ") + e.what(), ERROR_TYPE_SERVER);
|
||||
slot.print_timings();
|
||||
metrics.on_prediction(slot);
|
||||
slot.release();
|
||||
continue;
|
||||
}
|
||||
slot.i_batch_dft.clear();
|
||||
slot.drafted.clear();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue