diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 79d1d633d1..4639be985c 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1903,6 +1903,15 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
     llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
 
+    // set one output token per sequence in order to activate all backend samplers
+    std::vector<llama_seq_id> seq_ids(n_seqs);
+    for (uint32_t i = 0; i < n_seqs; ++i) {
+        seq_ids[i] = i;
+        ubatch.n_seq_id[i] = 1;
+        ubatch.seq_id[i] = &seq_ids[i];
+        ubatch.output[i] = true;
+    }
+
     auto * res = gf_res_reserve.get();
 
     const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index f396feeded..0abdab1637 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2063,18 +2063,18 @@ void llm_graph_context::build_sampling() const {
             logit_row_idx++;
         }
     }
+
     if (seq_to_logit_row.empty()) {
         return;
     }
 
-    // res->t_logits will contain logits for all tokens that specied that want
-    // logits calculated (logits=1 or output=1)
+    // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
     ggml_tensor * logits_t = res->t_logits;
     GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
 
     const int64_t n_vocab = logits_t->ne[0];
 
-    std::unordered_map active_samplers;
+    std::unordered_map active_samplers;
 
     for (const auto & [seq_id, sampler] : samplers) {
         // Only process samplers for sequences that are in the current batch
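
For context on why the graph_reserve() hunk is needed: build_sampling() only wires up samplers for sequences it can map to a logit row, and a reserve batch with no output-flagged tokens would presumably hit the seq_to_logit_row.empty() early return, leaving the reserved worst-case graph without any sampler nodes. The standalone sketch below illustrates that bookkeeping; toy_ubatch, seq_id_t, and seq_to_logit_rows are simplified hypothetical stand-ins, not the actual llama.cpp types.

    // Minimal standalone sketch (not llama.cpp code): toy_ubatch, seq_id_t and
    // seq_to_logit_rows are hypothetical stand-ins mimicking the bookkeeping
    // that build_sampling() performs on the real llama_ubatch.
    #include <cstdint>
    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    using seq_id_t = int32_t; // stand-in for llama_seq_id

    struct toy_ubatch {                   // only the fields the diff touches
        std::vector<int32_t>    n_seq_id; // per token: how many sequences it belongs to
        std::vector<seq_id_t *> seq_id;   // per token: pointer to its sequence id(s)
        std::vector<bool>       output;   // per token: does it produce a logit row?
    };

    // Each output-flagged token contributes one row to t_logits; map every
    // sequence to the row holding its output token. An empty result is the
    // condition under which build_sampling() returns early, i.e. no sampler
    // nodes would be added to the reserved graph.
    static std::unordered_map<seq_id_t, int> seq_to_logit_rows(const toy_ubatch & ub) {
        std::unordered_map<seq_id_t, int> rows;
        int logit_row_idx = 0;
        for (size_t i = 0; i < ub.output.size(); ++i) {
            if (!ub.output[i]) {
                continue; // no logits requested for this token -> no row
            }
            for (int32_t s = 0; s < ub.n_seq_id[i]; ++s) {
                rows[ub.seq_id[i][s]] = logit_row_idx;
            }
            logit_row_idx++;
        }
        return rows;
    }

    int main() {
        const uint32_t n_seqs = 4;

        // Mirror the graph_reserve() hunk: one token per sequence, each marked
        // as an output so every sequence shows up in the map.
        std::vector<seq_id_t> seq_ids(n_seqs);
        toy_ubatch ub;
        ub.n_seq_id.assign(n_seqs, 1);
        ub.seq_id.resize(n_seqs);
        ub.output.assign(n_seqs, true);
        for (uint32_t i = 0; i < n_seqs; ++i) {
            seq_ids[i]   = (seq_id_t) i;
            ub.seq_id[i] = &seq_ids[i];
        }

        // All n_seqs sequences get a logit row, so all backend samplers activate.
        printf("active sequences: %zu\n", seq_to_logit_rows(ub).size()); // -> 4
        return 0;
    }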