diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index e9f000bf2b..60b980ff6e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1331,7 +1331,6 @@ static void copy_tensor_async_ints(
     }
 
     const std::vector<uint32_t> & rows = it->second;
-    GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows");
 
     for (size_t i = 0; i < tensors.size(); ++i) {
         const uint32_t row = rows[i];
@@ -1364,7 +1363,6 @@ static void copy_tensor_async_floats(
     }
 
     const std::vector<uint32_t> & rows = it->second;
-    GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows");
 
     for (size_t i = 0; i < tensors.size(); ++i) {
         const uint32_t row = rows[i];
@@ -1401,7 +1399,6 @@ static void copy_tensor_async_candidates(
    }
 
     const std::vector<uint32_t> & rows = it->second;
-    GGML_ASSERT(tensors.size() == rows.size() && "number of tensors must match number of output rows");
 
     for (size_t i = 0; i < tensors.size(); ++i) {
         const uint32_t row = rows[i];
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 63e0276c83..1baafb0d82 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2605,17 +2606,18 @@ void llm_graph_context::build_sampling() const {
     for (const auto & [seq_id, sampler] : samplers) {
         const auto row_it = seq_to_logit_rows.find(seq_id);
+        const bool sampler_is_active = row_it != seq_to_logit_rows.end();
 
-        // row_it is now a sequence id to list of row ids
-        static const std::vector<int32_t> default_row = {0};
-        const std::vector<int32_t> & logit_rows = row_it != seq_to_logit_rows.end() ? row_it->second : default_row;
 
-        for (const int32_t row_idx : logit_rows) {
+        // Always build samplers for all possible outputs even if the sampler is
+        // not active (the sampler's sequence id is not in the current ubatch).
+        for (uint32_t i = 0; i < max_outputs; ++i) {
+            const bool real_output = sampler_is_active && i < row_it->second.size();
 
-            // inactive samplers always work on the first row
-            const int i_out = row_it != seq_to_logit_rows.end() ? 1 : 0;
+            const int32_t row_idx = real_output ? row_it->second[i] : 0;
+            const int i_out = real_output ? 1 : 0;
 
             ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
-            ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
+            ggml_format_name(logits_seq, "logits_seq_%d_%d", seq_id, i);
 
             struct llama_sampler_data data = {
                 /*.logits =*/ logits_seq,
@@ -2628,25 +2629,33 @@ void llm_graph_context::build_sampling() const {
             sampler->iface->backend_apply(sampler, ctx0, gf, &data);
 
             if (data.sampled != nullptr) {
-                res->t_sampled[seq_id].push_back(data.sampled);
+                if (real_output) {
+                    res->t_sampled[seq_id].push_back(data.sampled);
+                }
 
                 outs[1] = data.sampled;
                 ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
             }
 
             if (data.probs != nullptr) {
-                res->t_sampled_probs[seq_id].push_back(data.probs);
+                if (real_output) {
+                    res->t_sampled_probs[seq_id].push_back(data.probs);
+                }
 
                 outs[1] = data.probs;
                 ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
             }
 
             if (data.logits != nullptr) {
-                res->t_sampled_logits[seq_id].push_back(data.logits);
+                if (real_output) {
+                    res->t_sampled_logits[seq_id].push_back(data.logits);
+                }
 
                 outs[1] = data.logits;
                 ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
             }
 
             if (data.candidates != nullptr) {
-                res->t_candidates[seq_id].push_back(data.candidates);
+                if (real_output) {
+                    res->t_candidates[seq_id].push_back(data.candidates);
+                }
 
                 outs[1] = data.candidates;
                 ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
             }