server: friendlier error msg when ctx < input (#18174)

* llama-server: friendlier error msg when ctx < input

This PR adds formatted strings to the server's send_error function

* llama-server: use string_format inline

* fix test
This commit is contained in:
Aman Gupta 2025-12-19 19:10:00 +08:00 committed by GitHub
parent 98c1c7a7bf
commit cc0a04343e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 17 additions and 3 deletions

View File

@@ -1974,19 +1974,33 @@ struct server_context_impl {
if (!slot.can_split()) {
if (slot.task->n_tokens() > n_ubatch) {
send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
send_error(slot,
string_format(
"input (%d tokens) is too large to process. increase the physical batch "
"size (current batch size: %d)",
slot.task->n_tokens(), n_ubatch),
ERROR_TYPE_SERVER);
slot.release();
continue;
}
if (slot.task->n_tokens() > slot.n_ctx) {
send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
send_error(
slot,
string_format(
"input (%d tokens) is larger than the max context size (%d tokens). skipping",
slot.task->n_tokens(), slot.n_ctx),
ERROR_TYPE_EXCEED_CONTEXT_SIZE);
slot.release();
continue;
}
} else {
if (slot.task->n_tokens() >= slot.n_ctx) {
send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
send_error(slot,
string_format("request (%d tokens) exceeds the available context size (%d "
"tokens), try increasing it",
slot.task->n_tokens(), slot.n_ctx),
ERROR_TYPE_EXCEED_CONTEXT_SIZE);
slot.release();
continue;
}