Merge 5c92c76e9e into 75f3bc94e6
This commit is contained in:
commit
e7c630f821
|
|
@ -380,6 +380,7 @@ extern "C" {
|
||||||
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
|
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
|
||||||
struct llama_sampler_seq_config * samplers;
|
struct llama_sampler_seq_config * samplers;
|
||||||
size_t n_samplers;
|
size_t n_samplers;
|
||||||
|
uint32_t n_sampling_outputs_max;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_model_tensor_override {
|
struct llama_model_tensor_override {
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,7 @@ llama_context::llama_context(
|
||||||
cparams.cb_eval = params.cb_eval;
|
cparams.cb_eval = params.cb_eval;
|
||||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||||
|
|
||||||
|
cparams.n_sampling_outputs_max = params.n_sampling_outputs_max;
|
||||||
// Initialize backend samplers here so they are part of the sampling graph
|
// Initialize backend samplers here so they are part of the sampling graph
|
||||||
// before the reserve passes run later in this function. This avoids a later
|
// before the reserve passes run later in this function. This avoids a later
|
||||||
// re-reserve when graph nodes change.
|
// re-reserve when graph nodes change.
|
||||||
|
|
@ -1408,8 +1409,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
|
static std::map<llama_seq_id, std::vector<uint32_t>> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
|
||||||
std::map<llama_seq_id, uint32_t> seq_to_row;
|
std::map<llama_seq_id, std::vector<uint32_t>> seq_to_row;
|
||||||
// how many output tokens we have seen so far for this ubatch.
|
// how many output tokens we have seen so far for this ubatch.
|
||||||
uint32_t local = 0;
|
uint32_t local = 0;
|
||||||
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
|
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
|
||||||
|
|
@ -1420,96 +1421,111 @@ static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubat
|
||||||
|
|
||||||
const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||||
// row_offset is the number of output tokens before this ubatch.
|
// row_offset is the number of output tokens before this ubatch.
|
||||||
seq_to_row[seq_id] = row_offset + local;
|
seq_to_row[seq_id].push_back(row_offset + local);
|
||||||
++local;
|
++local;
|
||||||
}
|
}
|
||||||
return seq_to_row;
|
return seq_to_row;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void copy_tensor_async_ints(
|
static void copy_tensor_async_ints(
|
||||||
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
|
const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
|
||||||
const buffer_view<llama_token> & sampled,
|
const buffer_view<llama_token> & sampled,
|
||||||
const std::map<llama_seq_id, uint32_t> & seq_to_row,
|
const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
|
||||||
ggml_backend_sched_t sched) {
|
ggml_backend_sched_t sched) {
|
||||||
if (!sampled.has_data()) {
|
if (!sampled.has_data()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & [seq_id, tensor] : tensor_map) {
|
for (const auto & [seq_id, tensors] : tensor_map) {
|
||||||
auto it = seq_to_row.find(seq_id);
|
auto it = seq_to_row.find(seq_id);
|
||||||
if (it == seq_to_row.end()) {
|
if (it == seq_to_row.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint32_t row = it->second;
|
const std::vector<uint32_t> & rows = it->second;
|
||||||
GGML_ASSERT(row < sampled.size);
|
|
||||||
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
|
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||||
|
const uint32_t row = rows[i];
|
||||||
|
ggml_tensor * tensor = tensors[i];
|
||||||
|
|
||||||
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
|
GGML_ASSERT(row < sampled.size);
|
||||||
ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
|
GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
|
||||||
|
|
||||||
|
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
|
||||||
|
ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void copy_tensor_async_floats(
|
static void copy_tensor_async_floats(
|
||||||
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
|
const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
|
||||||
const buffer_view<float> & dst,
|
const buffer_view<float> & dst,
|
||||||
size_t stride,
|
size_t stride,
|
||||||
std::vector<uint32_t> & counts,
|
std::vector<uint32_t> & counts,
|
||||||
const std::map<llama_seq_id, uint32_t> & seq_to_row,
|
const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
|
||||||
ggml_backend_sched_t sched) {
|
ggml_backend_sched_t sched) {
|
||||||
if (!dst.has_data()) {
|
if (!dst.has_data()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & [seq_id, tensor] : tensor_map) {
|
for (const auto & [seq_id, tensors] : tensor_map) {
|
||||||
auto it = seq_to_row.find(seq_id);
|
auto it = seq_to_row.find(seq_id);
|
||||||
if (it == seq_to_row.end()) {
|
if (it == seq_to_row.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint32_t row = it->second;
|
const std::vector<uint32_t> & rows = it->second;
|
||||||
GGML_ASSERT(row < counts.size());
|
|
||||||
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
|
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||||
|
const uint32_t row = rows[i];
|
||||||
|
ggml_tensor * tensor = tensors[i];
|
||||||
|
|
||||||
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
|
GGML_ASSERT(row < counts.size());
|
||||||
float * row_ptr = dst.data + (size_t) row * stride;
|
GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
|
||||||
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
|
|
||||||
|
|
||||||
// Update the actual number of logits/probabilities that were written for this row.
|
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
|
||||||
counts[row] = ggml_nelements(tensor);
|
float * row_ptr = dst.data + (size_t) row * stride;
|
||||||
|
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
|
||||||
|
|
||||||
|
// Update the actual number of logits/probabilities that were written for this row.
|
||||||
|
counts[row] = ggml_nelements(tensor);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void copy_tensor_async_candidates(
|
static void copy_tensor_async_candidates(
|
||||||
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
|
const std::map<llama_seq_id, std::vector<ggml_tensor*>> & tensor_map,
|
||||||
const buffer_view<llama_token> & dst,
|
const buffer_view<llama_token> & dst,
|
||||||
size_t stride,
|
size_t stride,
|
||||||
std::vector<uint32_t> & counts,
|
std::vector<uint32_t> & counts,
|
||||||
const std::map<llama_seq_id, uint32_t> & seq_to_row,
|
const std::map<llama_seq_id, std::vector<uint32_t>> & seq_to_row,
|
||||||
ggml_backend_sched_t sched) {
|
ggml_backend_sched_t sched) {
|
||||||
if (!dst.has_data()) {
|
if (!dst.has_data()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & [seq_id, tensor] : tensor_map) {
|
for (const auto & [seq_id, tensors] : tensor_map) {
|
||||||
auto it = seq_to_row.find(seq_id);
|
auto it = seq_to_row.find(seq_id);
|
||||||
if (it == seq_to_row.end()) {
|
if (it == seq_to_row.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint32_t row = it->second;
|
const std::vector<uint32_t> & rows = it->second;
|
||||||
GGML_ASSERT(row < counts.size());
|
|
||||||
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
|
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||||
|
const uint32_t row = rows[i];
|
||||||
|
ggml_tensor * tensor = tensors[i];
|
||||||
|
|
||||||
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
|
GGML_ASSERT(row < counts.size());
|
||||||
llama_token * row_ptr = dst.data + (size_t) row * stride;
|
GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
|
||||||
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
|
|
||||||
|
|
||||||
// Update the actual number of candidates that were written.
|
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
|
||||||
counts[row] = ggml_nelements(tensor);
|
llama_token * row_ptr = dst.data + (size_t) row * stride;
|
||||||
|
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
|
||||||
|
|
||||||
|
// Update the actual number of candidates that were written.
|
||||||
|
counts[row] = ggml_nelements(tensor);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1555,30 +1571,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||||
|
|
||||||
const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
|
const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
|
||||||
|
|
||||||
// TODO: avoid this workaround in the future
|
|
||||||
if (has_samplers && batch_inp.logits) {
|
|
||||||
std::vector<int32_t> seq_output_count(n_seq_max, 0);
|
|
||||||
|
|
||||||
for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
|
|
||||||
if (batch_inp.logits[i] == 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
|
|
||||||
|
|
||||||
for (int32_t s = 0; s < ns; ++s) {
|
|
||||||
const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
|
|
||||||
|
|
||||||
seq_output_count[seq_id]++;
|
|
||||||
if (seq_output_count[seq_id] > 1) {
|
|
||||||
LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
|
|
||||||
__func__, seq_id, seq_output_count[seq_id]);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
|
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||||
return -1;
|
return -1;
|
||||||
|
|
@ -2070,14 +2062,29 @@ void llama_context::output_reorder() {
|
||||||
//
|
//
|
||||||
|
|
||||||
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
||||||
|
uint32_t res;
|
||||||
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
|
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
|
||||||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
res = std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||||
|
} else {
|
||||||
|
res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||||
|
for (const auto & lora : model.loras) {
|
||||||
|
res += lora->get_n_nodes();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
|
||||||
for (const auto & lora : model.loras) {
|
// Account for backend sampling with multiple outputs per sequence.
|
||||||
res += lora->get_n_nodes();
|
uint32_t sampling_nodes = 0;
|
||||||
|
if (!sampling.samplers.empty()) {
|
||||||
|
const uint32_t tensors_per_output = 50;
|
||||||
|
const uint32_t sampling_outputs = std::min<uint32_t>(n_tokens, cparams.n_sampling_outputs_max);
|
||||||
|
|
||||||
|
// Account for worst case (all sequences could have backend samplers).
|
||||||
|
const uint32_t max_samplers = cparams.n_seq_max;
|
||||||
|
|
||||||
|
sampling_nodes = tensors_per_output * sampling_outputs * max_samplers;
|
||||||
}
|
}
|
||||||
return res;
|
|
||||||
|
return res + sampling_nodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
llm_graph_result * llama_context::get_gf_res_reserve() const {
|
llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||||
|
|
@ -2918,6 +2925,7 @@ llama_context_params llama_context_default_params() {
|
||||||
/*.kv_unified =*/ false,
|
/*.kv_unified =*/ false,
|
||||||
/*.sampler =*/ nullptr,
|
/*.sampler =*/ nullptr,
|
||||||
/*.n_sampler =*/ 0,
|
/*.n_sampler =*/ 0,
|
||||||
|
/*.n_sampling_outputs_max =*/ 32,
|
||||||
};
|
};
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,8 @@ struct llama_cparams {
|
||||||
|
|
||||||
enum llama_pooling_type pooling_type;
|
enum llama_pooling_type pooling_type;
|
||||||
|
|
||||||
|
uint32_t n_sampling_outputs_max; // max outputs per sequence for backend sampling
|
||||||
|
|
||||||
ggml_backend_sched_eval_callback cb_eval;
|
ggml_backend_sched_eval_callback cb_eval;
|
||||||
void * cb_eval_user_data;
|
void * cb_eval_user_data;
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -842,24 +842,24 @@ void llm_graph_result::set_outputs() {
|
||||||
if (t_embd_pooled != nullptr) {
|
if (t_embd_pooled != nullptr) {
|
||||||
ggml_set_output(t_embd_pooled);
|
ggml_set_output(t_embd_pooled);
|
||||||
}
|
}
|
||||||
for (auto & [seq_id, t] : t_sampled) {
|
for (auto & [seq_id, tensors] : t_sampled) {
|
||||||
if (t != nullptr) {
|
for (ggml_tensor * tensor : tensors) {
|
||||||
ggml_set_output(t);
|
ggml_set_output(tensor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto & [seq_id, t] : t_sampled_probs) {
|
for (auto & [seq_id, tensors] : t_sampled_probs) {
|
||||||
if (t != nullptr) {
|
for (ggml_tensor * tensor : tensors) {
|
||||||
ggml_set_output(t);
|
ggml_set_output(tensor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto & [seq_id, t] : t_sampled_logits) {
|
for (auto & [seq_id, tensors] : t_sampled_logits) {
|
||||||
if (t != nullptr) {
|
for (ggml_tensor * tensor : tensors) {
|
||||||
ggml_set_output(t);
|
ggml_set_output(tensor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto & [seq_id, t] : t_candidates) {
|
for (auto & [seq_id, tensors] : t_candidates) {
|
||||||
if (t != nullptr) {
|
for (ggml_tensor * tensor : tensors) {
|
||||||
ggml_set_output(t);
|
ggml_set_output(tensor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -2725,13 +2725,13 @@ void llm_graph_context::build_sampling() const {
|
||||||
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
|
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
|
||||||
res->add_input(std::move(inp_sampling));
|
res->add_input(std::move(inp_sampling));
|
||||||
|
|
||||||
std::map<llama_seq_id, int32_t> seq_to_logit_row;
|
std::map<llama_seq_id, std::vector<int32_t>> seq_to_logit_rows;
|
||||||
int32_t logit_row_idx = 0;
|
int32_t logit_row_idx = 0;
|
||||||
|
|
||||||
for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
|
for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
|
||||||
if (ubatch.output[i]) {
|
if (ubatch.output[i]) {
|
||||||
llama_seq_id seq_id = ubatch.seq_id[i][0];
|
llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||||
seq_to_logit_row[seq_id] = logit_row_idx;
|
seq_to_logit_rows[seq_id].push_back(logit_row_idx);
|
||||||
logit_row_idx++;
|
logit_row_idx++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -2744,48 +2744,66 @@ void llm_graph_context::build_sampling() const {
|
||||||
// this is important in order to minimize graph reallocations
|
// this is important in order to minimize graph reallocations
|
||||||
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
|
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
|
||||||
|
|
||||||
|
// During graph reservation, n_outputs can be very large (for example 512 for worst-case PP).
|
||||||
|
// We cap it to a user-configurable maximum since typical multi output scenarios use far fewer.
|
||||||
|
const uint32_t max_outputs = std::min<uint32_t>(n_outputs, cparams.n_sampling_outputs_max);
|
||||||
|
|
||||||
for (const auto & [seq_id, sampler] : samplers) {
|
for (const auto & [seq_id, sampler] : samplers) {
|
||||||
const auto it = seq_to_logit_row.find(seq_id);
|
const auto row_it = seq_to_logit_rows.find(seq_id);
|
||||||
|
const bool sampler_is_active = row_it != seq_to_logit_rows.end();
|
||||||
|
|
||||||
// inactive samplers always work on the first row
|
// Always build samplers for all possible outputs even if the sampler is
|
||||||
const auto row_idx = it != seq_to_logit_row.end() ? it->second : 0;
|
// not active (the sampler's sequence id is not in the current ubatch).
|
||||||
const int i_out = it != seq_to_logit_row.end() ? 1 : 0;
|
for (uint32_t i = 0; i < max_outputs; ++i) {
|
||||||
|
const bool real_output = sampler_is_active && i < row_it->second.size();
|
||||||
|
|
||||||
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
|
const int32_t row_idx = real_output ? row_it->second[i] : 0;
|
||||||
ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
|
const int i_out = real_output ? 1 : 0;
|
||||||
|
|
||||||
struct llama_sampler_data data = {
|
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
|
||||||
/*.logits =*/ logits_seq,
|
ggml_format_name(logits_seq, "logits_seq_%d_%d", seq_id, i);
|
||||||
/*.probs =*/ nullptr,
|
|
||||||
/*.sampled =*/ nullptr,
|
|
||||||
/*.candidates =*/ nullptr,
|
|
||||||
};
|
|
||||||
|
|
||||||
assert(sampler->iface->backend_apply);
|
struct llama_sampler_data data = {
|
||||||
sampler->iface->backend_apply(sampler, ctx0, gf, &data);
|
/*.logits =*/ logits_seq,
|
||||||
|
/*.probs =*/ nullptr,
|
||||||
|
/*.sampled =*/ nullptr,
|
||||||
|
/*.candidates =*/ nullptr,
|
||||||
|
};
|
||||||
|
|
||||||
if (data.sampled != nullptr) {
|
assert(sampler->iface->backend_apply);
|
||||||
res->t_sampled[seq_id] = data.sampled;
|
sampler->iface->backend_apply(sampler, ctx0, gf, &data);
|
||||||
outs[1] = data.sampled;
|
|
||||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (data.probs != nullptr) {
|
if (data.sampled != nullptr) {
|
||||||
res->t_sampled_probs[seq_id] = data.probs;
|
if (real_output) {
|
||||||
outs[1] = data.probs;
|
res->t_sampled[seq_id].push_back(data.sampled);
|
||||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
}
|
||||||
}
|
outs[1] = data.sampled;
|
||||||
|
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||||
|
}
|
||||||
|
|
||||||
if (data.logits != nullptr) {
|
if (data.probs != nullptr) {
|
||||||
res->t_sampled_logits[seq_id] = data.logits;
|
if (real_output) {
|
||||||
outs[1] = data.logits;
|
res->t_sampled_probs[seq_id].push_back(data.probs);
|
||||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
}
|
||||||
}
|
outs[1] = data.probs;
|
||||||
|
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||||
|
}
|
||||||
|
|
||||||
if (data.candidates != nullptr) {
|
if (data.logits != nullptr) {
|
||||||
res->t_candidates[seq_id] = data.candidates;
|
if (real_output) {
|
||||||
outs[1] = data.candidates;
|
res->t_sampled_logits[seq_id].push_back(data.logits);
|
||||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
}
|
||||||
|
outs[1] = data.logits;
|
||||||
|
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data.candidates != nullptr) {
|
||||||
|
if (real_output) {
|
||||||
|
res->t_candidates[seq_id].push_back(data.candidates);
|
||||||
|
}
|
||||||
|
outs[1] = data.candidates;
|
||||||
|
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -672,10 +672,10 @@ public:
|
||||||
ggml_tensor * t_embd = nullptr;
|
ggml_tensor * t_embd = nullptr;
|
||||||
ggml_tensor * t_embd_pooled = nullptr;
|
ggml_tensor * t_embd_pooled = nullptr;
|
||||||
|
|
||||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
|
std::map<llama_seq_id, std::vector<ggml_tensor*>> t_sampled_logits;
|
||||||
std::map<llama_seq_id, ggml_tensor*> t_candidates;
|
std::map<llama_seq_id, std::vector<ggml_tensor*>> t_candidates;
|
||||||
std::map<llama_seq_id, ggml_tensor*> t_sampled;
|
std::map<llama_seq_id, std::vector<ggml_tensor*>> t_sampled;
|
||||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
|
std::map<llama_seq_id, std::vector<ggml_tensor*>> t_sampled_probs;
|
||||||
|
|
||||||
std::vector<llm_graph_input_ptr> inputs;
|
std::vector<llm_graph_input_ptr> inputs;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1180,6 +1180,9 @@ static void llama_sampler_dist_backend_apply(
|
||||||
struct ggml_tensor * idxf = ggml_sum(ctx, mask);
|
struct ggml_tensor * idxf = ggml_sum(ctx, mask);
|
||||||
ggml_set_name(idxf, "dist_index_f32");
|
ggml_set_name(idxf, "dist_index_f32");
|
||||||
|
|
||||||
|
// Clamp to prevent out-of-bounds access when computing the index.
|
||||||
|
idxf = ggml_clamp(ctx, idxf, 1.0f, mask->ne[0]);
|
||||||
|
|
||||||
// Use ggml_scale_bias to scale the index value by -1 and then add the size
|
// Use ggml_scale_bias to scale the index value by -1 and then add the size
|
||||||
// of the mask to that value so we get the correct index ((-1 * idxf) + n).
|
// of the mask to that value so we get the correct index ((-1 * idxf) + n).
|
||||||
struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);
|
struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);
|
||||||
|
|
|
||||||
|
|
@ -969,7 +969,7 @@ static void test_backend_cpu_mixed_batch(const test_params & params) {
|
||||||
printf("backend-cpu mixed batch test PASSED\n");
|
printf("backend-cpu mixed batch test PASSED\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_backend_max_outputs(const test_params & params) {
|
static void test_backend_multiple_outputs(const test_params & params) {
|
||||||
const int seq_id = 0;
|
const int seq_id = 0;
|
||||||
const int32_t seed = 88;
|
const int32_t seed = 88;
|
||||||
|
|
||||||
|
|
@ -995,17 +995,32 @@ static void test_backend_max_outputs(const test_params & params) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < tokens.size(); i++) {
|
for (size_t i = 0; i < tokens.size(); i++) {
|
||||||
// set all tokens as output to trigger error
|
// set all tokens as output to get multiple outputs for a single sequence.
|
||||||
common_batch_add(batch, tokens[i], i, { seq_id }, true);
|
common_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf(">>> test_max_outputs expected error start:\n");
|
|
||||||
const int ret = llama_decode(test_ctx.ctx.get(), batch);
|
const int ret = llama_decode(test_ctx.ctx.get(), batch);
|
||||||
GGML_ASSERT(ret != 0 && "llama_decode should not succeed multiple outputs per sequence");
|
if (ret != 0) {
|
||||||
printf("<<< test_max_outputs expected error end.\n");
|
GGML_ASSERT(false && "Failed to decode sequence with multiple outputs");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_token> sampled_tokens;
|
||||||
|
for (int i = 0; i < batch.n_tokens; i++) {
|
||||||
|
if (batch.logits[i]) {
|
||||||
|
llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), i);
|
||||||
|
const std::string token_str = test_ctx.token_to_piece(token, false);
|
||||||
|
//printf("Position %d: token id=%d, string='%s'\n", i, token, token_str.c_str());
|
||||||
|
GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
|
||||||
|
sampled_tokens.push_back(token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT((int)sampled_tokens.size() == batch.n_tokens);
|
||||||
|
printf("Sampled %zu tokens for sequence %d\n", sampled_tokens.size(), seq_id);
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
printf("backend max outputs test PASSED\n");
|
printf("backend multiple outputs test PASSED\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
struct backend_test_case {
|
struct backend_test_case {
|
||||||
|
|
@ -1024,7 +1039,7 @@ static const backend_test_case BACKEND_TESTS[] = {
|
||||||
{ "dist", test_backend_dist_sampling, true },
|
{ "dist", test_backend_dist_sampling, true },
|
||||||
{ "dist_and_cpu", test_backend_dist_sampling_and_cpu, true },
|
{ "dist_and_cpu", test_backend_dist_sampling_and_cpu, true },
|
||||||
{ "set_sampler", test_backend_set_sampler, true },
|
{ "set_sampler", test_backend_set_sampler, true },
|
||||||
{ "max_outputs", test_backend_max_outputs, true },
|
{ "multiple_outputs",test_backend_multiple_outputs, true },
|
||||||
{ "mixed", test_backend_mixed_sampling, true },
|
{ "mixed", test_backend_mixed_sampling, true },
|
||||||
{ "min_p", test_backend_min_p_sampling, true },
|
{ "min_p", test_backend_min_p_sampling, true },
|
||||||
{ "cpu_mixed", test_backend_cpu_mixed_batch, true },
|
{ "cpu_mixed", test_backend_cpu_mixed_batch, true },
|
||||||
|
|
|
||||||
|
|
@ -1194,9 +1194,6 @@ private:
|
||||||
|
|
||||||
backend_sampling &= task.params.sampling.backend_sampling;
|
backend_sampling &= task.params.sampling.backend_sampling;
|
||||||
|
|
||||||
// TODO: speculative decoding requires multiple samples per batch - not supported yet
|
|
||||||
backend_sampling &= !(slot.spec && task.params.speculative.n_max > 0);
|
|
||||||
|
|
||||||
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
|
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
|
||||||
backend_sampling &= !need_logits;
|
backend_sampling &= !need_logits;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue