fixed mtp kv cache update sequencing after prompt processing
parent 6870f9790c
commit 382135aa36
@@ -3543,18 +3543,19 @@ struct server_context {
             const int tok_idx = slot.i_batch - i;
 
-            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
-            if (slot.has_mtp) {
-                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
-            }
 
             llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
             slot.last_tok_idx = tok_idx;
             SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str());
 
             slot.i_batch = -1;
 
             common_sampler_accept(slot.smpl, id, true);
 
+            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
+            if (slot.has_mtp) {
+                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
+            }
+
             slot.n_decoded += 1;
 
             const int64_t t_current = ggml_time_us();
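For context, here is a minimal, self-contained sketch of the gating the moved block relies on, under the assumption (suggested by the in-diff comment) that slot.mtp_kv_update_batch is populated during prompt processing and emptied by the first update, so the call placed after common_sampler_accept does real work exactly once per request and is a no-op for the rest of token generation. All types and helpers below are illustrative stand-ins, not the real llama.cpp server API; only the ordering mirrors the diff.

// Stand-alone illustration of "update once, after prompt processing" gating.
// None of these names are the real llama.cpp/server symbols; they only mimic
// the ordering shown in the diff: sample -> accept -> MTP KV cache update.
#include <cstdio>
#include <vector>

struct mtp_update_batch {                       // stand-in for slot.mtp_kv_update_batch
    std::vector<int> prompt_tokens;             // filled during prompt processing
    bool empty() const { return prompt_tokens.empty(); }
};

static int  sample_token(int step) { return 100 + step; }            // stand-in sampler
static void accept_token(int tok)  { std::printf("accept %d\n", tok); }

static void mtp_update_kv_cache(mtp_update_batch & batch) {          // stand-in update
    if (batch.empty()) {                        // non-empty only right after prompt processing
        return;                                 // later calls during generation are no-ops
    }
    std::printf("MTP KV cache update over %zu prompt tokens\n", batch.prompt_tokens.size());
    batch.prompt_tokens.clear();                // consume the batch so this runs exactly once
}

int main() {
    mtp_update_batch batch{{1, 2, 3, 4}};       // pretend prompt processing queued 4 tokens
    const bool has_mtp = true;

    for (int step = 0; step < 3; ++step) {      // token-generation loop
        const int id = sample_token(step);      // 1) sample from the main model's logits
        accept_token(id);                       // 2) accept into the sampler state
        if (has_mtp) {
            mtp_update_kv_cache(batch);         // 3) MTP KV update; only the first pass
        }                                       //    after prompt processing does real work
    }
    return 0;
}

Modelled this way, the update can stay unconditional inside the decode loop while its cost is paid only once per request, which is what the comment carried along in the diff describes.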