graph : do not include llama-model.h

Georgi Gerganov 2025-11-18 13:53:25 +02:00
parent 71574f9273
commit 4b52e59903
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
6 changed files with 31 additions and 28 deletions

common/common.h

@@ -189,8 +189,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // Backend sampling flags
-    bool backend_sampling = false; // enable backend sampling
-    bool backend_dist = false; // backend performs final sampling (dist)
+    bool backend_sampling = false; // enable backend sampling
+    bool backend_dist = false; // backend performs final sampling (dist)
 
     // print the parameters into a string
     std::string print() const;
@@ -517,8 +517,8 @@ struct common_params {
         return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
     }
 
-    struct llama_sampler_seq_config * backend_samplers = NULL;
-    size_t n_backend_samplers = 0;
+    llama_sampler_seq_config * backend_samplers = NULL;
+    size_t n_backend_samplers = 0;
 };
 
 // call once at the start of a program if it uses libcommon

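For orientation, here is a minimal sketch (not part of this commit) of how an application built on libcommon might enable the flags above. It assumes the usual pattern of a common_params instance exposing the common_params_sampling struct shown here as a "sampling" member; that member name is an assumption, not something this diff confirms.

    common_params params;

    // opt in to building the sampler chains into the decode graph
    params.sampling.backend_sampling = true;
    // additionally let the backend perform the final (dist) selection itself
    params.sampling.backend_dist = true;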
common/sampling.cpp

@@ -113,9 +113,9 @@ struct common_sampler {
     llama_token_data_array cur_p;
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const float * sampled_probs = llama_get_backend_sampled_probs_ith(ctx, idx);
-        const float * sampled_logits = llama_get_backend_sampled_logits_ith(ctx, idx);
-        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);
+        const float * sampled_probs = llama_get_backend_sampled_probs_ith (ctx, idx);
+        const float * sampled_logits = llama_get_backend_sampled_logits_ith (ctx, idx);
+        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -143,11 +143,11 @@ struct common_sampler {
         cur.reserve(sampled_logits_count);
         // The backend sampler has filtered the logits so we need to use the sampled ids.
         if (sampled_ids != nullptr) {
-            for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
                 cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
             }
         } else {
-            for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
+            for (llama_token token_id = 0; token_id < (int) sampled_logits_count; token_id++) {
                 cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
             }
         }
@@ -414,10 +414,12 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     // Check if a backend sampler has already sampled a token in which case we
     // return that token id directly.
-    const llama_token backend_sampled_token = llama_get_backend_sampled_token_ith(ctx, idx);
-    if (backend_sampled_token != LLAMA_TOKEN_NULL) {
-        LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, backend_sampled_token);
-        return backend_sampled_token;
+    {
+        const llama_token id = llama_get_backend_sampled_token_ith(ctx, idx);
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+            return id;
+        }
     }
 
     gsmpl->set_logits(ctx, idx);

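To illustrate the short-circuit refactored above from the caller's point of view, here is a small sketch; the helper name sample_next is ours and the use of common_sampler_accept is an assumption about the usual libcommon flow, not something shown in this diff. When a backend sampler has already produced a token, common_sampler_sample returns it without running any CPU samplers; otherwise set_logits rebuilds cur_p from the (possibly backend-filtered) logits and the CPU chain runs as before.

    static llama_token sample_next(common_sampler * gsmpl, llama_context * ctx, int idx) {
        // either the token already picked by the backend sampler, or the result of the CPU chain
        const llama_token id = common_sampler_sample(gsmpl, ctx, idx, /*grammar_first =*/ false);
        common_sampler_accept(gsmpl, id, /*accept_grammar =*/ true);
        return id;
    }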
src/llama-context.cpp

@@ -1233,9 +1233,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };
 
     int64_t n_outputs_prev = 0;
 
     // This flag indicates whether a backend sampler has actually sampled a specific
-    // token, or if it has produced probabilites. If true, we true we can skip
-    // the normal copying of logits and embeddings.
+    // token, or if it has produced probabilites. If true, we can skip the normal copying of logits and embeddings.
     bool backend_has_sampled = false;
 
     do {
@@ -1655,6 +1655,7 @@ llm_graph_params llama_context::graph_params(
         /*.gtype =*/ gtype,
         /*.sched =*/ sched.get(),
         /*.backend_cpu =*/ backend_cpu,
+        /*.dev_out =*/ model.dev_output(),
        /*.cvec =*/ &cvec,
         /*.loras =*/ &loras,
         /*.mctx =*/ mctx,
@@ -2712,8 +2713,8 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
     return ctx->get_embeddings_seq(seq_id);
 }
 
-void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * sampler) {
-    ctx->set_backend_sampler(seq_id, sampler);
+void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
+    ctx->set_backend_sampler(seq_id, smpl);
 }
 
 llama_token llama_get_backend_sampled_token_ith(llama_context * ctx, int32_t i) {

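For context, a rough sketch of how the two C-API entry points touched above fit together; this is illustrative only, and the greedy chain plus the output index 0 are placeholder choices rather than anything prescribed by the commit. A sampler chain is attached to a sequence so that it runs inside the decode graph, and the sampled token is read back per output index after decoding.

    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // run this chain on the backend for sequence 0
    llama_set_backend_sampler(ctx, /*seq_id =*/ 0, smpl);

    // ... llama_decode(ctx, batch) ...

    // LLAMA_TOKEN_NULL means the backend only produced logits/probs for this output
    const llama_token id = llama_get_backend_sampled_token_ith(ctx, /*i =*/ 0);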
src/llama-graph.cpp

@@ -3,7 +3,6 @@
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
@@ -610,6 +609,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     rope_type (hparams.rope_type),
     sched (params.sched),
     backend_cpu (params.backend_cpu),
+    dev_out (params.dev_out),
     cvec (params.cvec),
     loras (params.loras),
     mctx (params.mctx),
@@ -2049,8 +2049,7 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
 
-void llm_graph_context::build_sampling(const llama_model & model, const llm_graph_params & params) const {
-    GGML_UNUSED(params);
+void llm_graph_context::build_sampling() const {
     if (samplers.empty()) {
         return;
     }
@@ -2074,11 +2073,9 @@ void llm_graph_context::build_sampling(const llama_model & model, const llm_grap
     ggml_tensor * logits_t = res->t_logits;
     GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
 
-    const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(&model));
-    GGML_ASSERT(logits_t->ne[0] == n_vocab);
+    const int64_t n_vocab = logits_t->ne[0];
 
-    ggml_backend_dev_t device = model.dev_output();
-    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(device);
+    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev_out);
 
     std::unordered_map<llama_seq_id, llama_sampler*> active_samplers;

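The net effect of the two hunks above is that build_sampling no longer needs llama_model at all, which is what allows llama-model.h to be dropped from the includes: the vocabulary size is read off the logits tensor, and the buffer type comes from the output device handed in via llm_graph_params::dev_out. A condensed sketch of that pattern follows; the helper name sampling_buft is ours, not part of the code base.

    static ggml_backend_buffer_type_t sampling_buft(const ggml_tensor * logits_t, ggml_backend_dev_t dev_out) {
        const int64_t n_vocab = logits_t->ne[0]; // first dim of the [n_vocab, n_outputs] logits tensor
        GGML_ASSERT(n_vocab > 0);
        // place the sampling tensors on the same device that produces the outputs
        return ggml_backend_dev_buffer_type(dev_out);
    }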
src/llama-graph.h

@@ -436,6 +436,7 @@ struct llm_graph_params {
     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;
+    ggml_backend_dev_t dev_out;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
@@ -445,8 +446,8 @@
     std::unordered_map<llama_seq_id, llama_sampler*> samplers;
 
     static bool samplers_equal(
-            const std::unordered_map<llama_seq_id, llama_sampler*> & lhs,
-            const std::unordered_map<llama_seq_id, llama_sampler*> & rhs) {
+            const std::unordered_map<llama_seq_id, llama_sampler *> & lhs,
+            const std::unordered_map<llama_seq_id, llama_sampler *> & rhs) {
         if (lhs.size() != rhs.size()) {
             return false;
         }
@@ -624,6 +625,8 @@ struct llm_graph_context {
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend_dev_t dev_out;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
     const llama_memory_context_i * mctx;
@@ -875,7 +878,7 @@ struct llm_graph_context {
     // sampling (backend sampling)
     //
 
-    void build_sampling(const llama_model & model, const llm_graph_params & params) const;
+    void build_sampling() const;
 
     //
     // dense (out)

src/llama-model.cpp

@@ -7413,7 +7413,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
     // add backend sampling layers (if any)
-    llm->build_sampling(*this, params);
+    llm->build_sampling();
 
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers