server: save generated text for the /slots endpoint (for LLAMA_SERVER_SLOTS_DEBUG=1) (#19622)
* save generated text for the /slots endpoint
* update debug_generated_text only when LLAMA_SERVER_SLOTS_DEBUG > 0
* Apply suggestions from code review

---------

Co-authored-by: Matteo <matteo@matteo>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
This commit is contained in:
parent
eeef3cfced
commit
b55dcdef5d
|
|
@ -77,6 +77,7 @@ struct server_slot {
|
|||
size_t last_nl_pos = 0;
|
||||
|
||||
std::string generated_text;
|
||||
std::string debug_generated_text;
|
||||
llama_tokens generated_tokens;
|
||||
|
||||
// idx of draft tokens in the main batch
|
||||
|
|
@ -425,7 +426,7 @@ struct server_slot {
|
|||
|
||||
if (!only_metrics) {
|
||||
res["prompt"] = ptask->tokens.detokenize(ctx, true);
|
||||
res["generated"] = generated_text;
|
||||
res["generated"] = generated_text.empty() ? debug_generated_text : generated_text;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1442,6 +1443,12 @@ private:
|
|||
res->id_slot = slot.id;
|
||||
|
||||
res->index = slot.task->index;
|
||||
|
||||
// keep copy of last generated text for debugging purposes
|
||||
if (slots_debug) {
|
||||
slot.debug_generated_text = slot.generated_text;
|
||||
}
|
||||
|
||||
// in stream mode, content and tokens are already in last partial chunk
|
||||
if (slot.task->params.stream) {
|
||||
res->content = "";
|
||||
|
|
|
|||
Loading…
Reference in New Issue