diff --git a/common/common.h b/common/common.h
index b320d891f5..be34bcb78c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -189,8 +189,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // Backend sampling flags
-    bool backend_sampling = false; // enable backend sampling
-    bool backend_dist = false; // backend performs final sampling (dist)
+    bool backend_sampling = false; // enable backend sampling
+    bool backend_dist     = false; // backend performs final sampling (dist)
 
     // print the parameters into a string
     std::string print() const;
@@ -517,8 +517,8 @@ struct common_params {
         return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
     }
 
-    struct llama_sampler_seq_config * backend_samplers = NULL;
-    size_t n_backend_samplers = 0;
+    llama_sampler_seq_config * backend_samplers   = NULL;
+    size_t                     n_backend_samplers = 0;
 };
 
 // call once at the start of a program if it uses libcommon
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 1fc5c7ce0a..ec61c18832 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -113,9 +113,9 @@ struct common_sampler {
     llama_token_data_array cur_p;
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const float * sampled_probs = llama_get_backend_sampled_probs_ith(ctx, idx);
-        const float * sampled_logits = llama_get_backend_sampled_logits_ith(ctx, idx);
-        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);
+        const float       * sampled_probs  = llama_get_backend_sampled_probs_ith    (ctx, idx);
+        const float       * sampled_logits = llama_get_backend_sampled_logits_ith   (ctx, idx);
+        const llama_token * sampled_ids    = llama_get_backend_sampled_token_ids_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -143,11 +143,11 @@ struct common_sampler {
         cur.reserve(sampled_logits_count);
         // The backend sampler has filtered the logits so we need to use the sampled ids.
         if (sampled_ids != nullptr) {
-            for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
                 cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
             }
         } else {
-            for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
+            for (llama_token token_id = 0; token_id < (int) sampled_logits_count; token_id++) {
                 cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
             }
         }
@@ -414,10 +414,12 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     // Check if a backend sampler has already sampled a token in which case we
     // return that token id directly.
-    const llama_token backend_sampled_token = llama_get_backend_sampled_token_ith(ctx, idx);
-    if (backend_sampled_token != LLAMA_TOKEN_NULL) {
-        LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, backend_sampled_token);
-        return backend_sampled_token;
+    {
+        const llama_token id = llama_get_backend_sampled_token_ith(ctx, idx);
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+            return id;
+        }
     }
 
     gsmpl->set_logits(ctx, idx);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 877116cbfe..f931881c9c 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1233,9 +1233,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };
 
     int64_t n_outputs_prev = 0;
+
     // This flag indicates whether a backend sampler has actually sampled a specific
-    // token, or if it has produced probabilites. If true, we true we can skip
-    // the normal copying of logits and embeddings.
+    // token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
     bool backend_has_sampled = false;
 
     do {
@@ -1655,6 +1655,7 @@ llm_graph_params llama_context::graph_params(
         /*.gtype       =*/ gtype,
         /*.sched       =*/ sched.get(),
         /*.backend_cpu =*/ backend_cpu,
+        /*.dev_out     =*/ model.dev_output(),
         /*.cvec        =*/ &cvec,
         /*.loras       =*/ &loras,
         /*.mctx        =*/ mctx,
@@ -2712,8 +2713,8 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
     return ctx->get_embeddings_seq(seq_id);
 }
 
-void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * sampler) {
-    ctx->set_backend_sampler(seq_id, sampler);
+void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
+    ctx->set_backend_sampler(seq_id, smpl);
 }
 
 llama_token llama_get_backend_sampled_token_ith(llama_context * ctx, int32_t i) {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 49aab37f33..561e629869 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -3,7 +3,6 @@
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"
 
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
@@ -610,6 +609,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     rope_type        (hparams.rope_type),
     sched            (params.sched),
     backend_cpu      (params.backend_cpu),
+    dev_out          (params.dev_out),
    cvec             (params.cvec),
     loras            (params.loras),
     mctx             (params.mctx),
@@ -2049,8 +2049,7 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
 
-void llm_graph_context::build_sampling(const llama_model & model, const llm_graph_params & params) const {
-    GGML_UNUSED(params);
+void llm_graph_context::build_sampling() const {
     if (samplers.empty()) {
         return;
     }
@@ -2074,11 +2073,9 @@
     ggml_tensor * logits_t = res->t_logits;
     GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
 
-    const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(&model));
-    GGML_ASSERT(logits_t->ne[0] == n_vocab);
+    const int64_t n_vocab = logits_t->ne[0];
 
-    ggml_backend_dev_t device = model.dev_output();
-    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(device);
+    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev_out);
 
     std::unordered_map active_samplers;
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index bd176e5d38..552c3e724f 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -436,6 +436,7 @@ struct llm_graph_params {
 
     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;
+    ggml_backend_dev_t dev_out;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
@@ -445,8 +446,8 @@
     std::unordered_map samplers;
 
     static bool samplers_equal(
-            const std::unordered_map & lhs,
-            const std::unordered_map & rhs) {
+            const std::unordered_map & lhs,
+            const std::unordered_map & rhs) {
         if (lhs.size() != rhs.size()) {
             return false;
         }
@@ -624,6 +625,8 @@ struct llm_graph_context {
 
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
 
+    ggml_backend_dev_t dev_out;
+
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
     const llama_memory_context_i * mctx;
@@ -875,7 +878,7 @@ struct llm_graph_context {
     // sampling (backend sampling)
     //
 
-    void build_sampling(const llama_model & model, const llm_graph_params & params) const;
+    void build_sampling() const;
 
     //
     // dense (out)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ca75ce4c9e..1647b85453 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7413,7 +7413,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
     // add backend sampling layers (if any)
-    llm->build_sampling(*this, params);
+    llm->build_sampling();
 
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers
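
Note (not part of the patch): a minimal sketch of how the per-sequence backend-sampling API touched above might be driven from application code. It assumes an existing llama_context; the sampler-chain helpers are the standard llama.h ones, seq_id 0 and the greedy chain are arbitrary choices for illustration, and the ownership/lifetime of the chain is not specified by this patch.

#include "llama.h"

// Attach a sampler chain to sequence 0 so the backend can evaluate it as part of
// the decode graph. Done once at setup; the chain is returned so the caller can
// manage its lifetime (ownership semantics are not shown by this patch).
static llama_sampler * setup_backend_sampler(llama_context * ctx) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    llama_set_backend_sampler(ctx, /*seq_id=*/ 0, chain);
    return chain;
}

// After llama_decode(), prefer the token the backend sampler already selected;
// otherwise fall back to CPU-side sampling on the backend-filtered logits,
// mirroring the pattern in common_sampler_sample above.
static llama_token pick_token(llama_context * ctx, int32_t idx) {
    const llama_token id = llama_get_backend_sampled_token_ith(ctx, idx);
    if (id != LLAMA_TOKEN_NULL) {
        return id; // backend performed the final sampling step (backend_dist)
    }
    // const float       * logits = llama_get_backend_sampled_logits_ith   (ctx, idx);
    // const llama_token * ids    = llama_get_backend_sampled_token_ids_ith(ctx, idx);
    // ... run the usual CPU samplers over (ids, logits) here ...
    return LLAMA_TOKEN_NULL;
}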