diff --git a/common/common.h b/common/common.h
index b320d891f5..be34bcb78c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -189,8 +189,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // Backend sampling flags
-    bool backend_sampling = false; // enable backend sampling
-    bool backend_dist = false; // backend performs final sampling (dist)
+    bool backend_sampling = false; // enable backend sampling
+    bool backend_dist     = false; // backend performs final sampling (dist)
 
     // print the parameters into a string
     std::string print() const;
@@ -517,8 +517,8 @@ struct common_params {
         return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
     }
 
-    struct llama_sampler_seq_config * backend_samplers = NULL;
-    size_t n_backend_samplers = 0;
+    llama_sampler_seq_config * backend_samplers   = NULL;
+    size_t                     n_backend_samplers = 0;
 };
 
 // call once at the start of a program if it uses libcommon
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 1fc5c7ce0a..ec61c18832 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -113,9 +113,9 @@ struct common_sampler {
     llama_token_data_array cur_p;
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const float * sampled_probs = llama_get_backend_sampled_probs_ith(ctx, idx);
-        const float * sampled_logits = llama_get_backend_sampled_logits_ith(ctx, idx);
-        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);
+        const float       * sampled_probs  = llama_get_backend_sampled_probs_ith    (ctx, idx);
+        const float       * sampled_logits = llama_get_backend_sampled_logits_ith   (ctx, idx);
+        const llama_token * sampled_ids    = llama_get_backend_sampled_token_ids_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -143,11 +143,11 @@ struct common_sampler {
         cur.reserve(sampled_logits_count);
         // The backend sampler has filtered the logits so we need to use the sampled ids.
         if (sampled_ids != nullptr) {
-            for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
                 cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
             }
         } else {
-            for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
+            for (llama_token token_id = 0; token_id < (int) sampled_logits_count; token_id++) {
                 cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
             }
         }
@@ -414,10 +414,12 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     // Check if a backend sampler has already sampled a token in which case we
     // return that token id directly.
-    const llama_token backend_sampled_token = llama_get_backend_sampled_token_ith(ctx, idx);
-    if (backend_sampled_token != LLAMA_TOKEN_NULL) {
-        LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, backend_sampled_token);
-        return backend_sampled_token;
+    {
+        const llama_token id = llama_get_backend_sampled_token_ith(ctx, idx);
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+            return id;
+        }
     }
 
     gsmpl->set_logits(ctx, idx);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 877116cbfe..f931881c9c 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1233,9 +1233,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };
 
     int64_t n_outputs_prev = 0;
+
     // This flag indicates whether a backend sampler has actually sampled a specific
-    // token, or if it has produced probabilites. If true, we true we can skip
-    // the normal copying of logits and embeddings.
+    // token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
     bool backend_has_sampled = false;
 
     do {
@@ -1655,6 +1655,7 @@ llm_graph_params llama_context::graph_params(
         /*.gtype       =*/ gtype,
         /*.sched       =*/ sched.get(),
         /*.backend_cpu =*/ backend_cpu,
+        /*.dev_out     =*/ model.dev_output(),
         /*.cvec        =*/ &cvec,
         /*.loras       =*/ &loras,
         /*.mctx        =*/ mctx,
@@ -2712,8 +2713,8 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
     return ctx->get_embeddings_seq(seq_id);
 }
 
-void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * sampler) {
-    ctx->set_backend_sampler(seq_id, sampler);
+void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
+    ctx->set_backend_sampler(seq_id, smpl);
 }
 
 llama_token llama_get_backend_sampled_token_ith(llama_context * ctx, int32_t i) {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 49aab37f33..561e629869 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -3,7 +3,6 @@
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"
 
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
@@ -610,6 +609,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     rope_type        (hparams.rope_type),
     sched            (params.sched),
     backend_cpu      (params.backend_cpu),
+    dev_out          (params.dev_out),
    cvec             (params.cvec),
     loras            (params.loras),
     mctx             (params.mctx),
@@ -2049,8 +2049,7 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
 
-void llm_graph_context::build_sampling(const llama_model & model, const llm_graph_params & params) const {
-    GGML_UNUSED(params);
+void llm_graph_context::build_sampling() const {
     if (samplers.empty()) {
         return;
     }
@@ -2074,11 +2073,9 @@
     ggml_tensor * logits_t = res->t_logits;
     GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
 
-    const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(&model));
-    GGML_ASSERT(logits_t->ne[0] == n_vocab);
+    const int64_t n_vocab = logits_t->ne[0];
 
-    ggml_backend_dev_t device = model.dev_output();
-    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(device);
+    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev_out);
 
     std::unordered_map active_samplers;
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index bd176e5d38..552c3e724f 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -436,6 +436,7 @@ struct llm_graph_params {
 
     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;
+    ggml_backend_dev_t dev_out;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
@@ -445,8 +446,8 @@
     std::unordered_map samplers;
 
     static bool samplers_equal(
-            const std::unordered_map & lhs,
-            const std::unordered_map & rhs) {
+            const std::unordered_map & lhs,
+            const std::unordered_map & rhs) {
         if (lhs.size() != rhs.size()) {
             return false;
         }
@@ -624,6 +625,8 @@ struct llm_graph_context {
 
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
 
+    ggml_backend_dev_t dev_out;
+
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
     const llama_memory_context_i * mctx;
@@ -875,7 +878,7 @@ struct llm_graph_context {
     // sampling (backend sampling)
     //
 
-    void build_sampling(const llama_model & model, const llm_graph_params & params) const;
+    void build_sampling() const;
 
     //
     // dense (out)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ca75ce4c9e..1647b85453 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7413,7 +7413,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
     // add backend sampling layers (if any)
-    llm->build_sampling(*this, params);
+    llm->build_sampling();
 
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers
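
Note (not part of the patch): a minimal sketch of how the per-sequence backend-sampling API touched above might be driven from application code. It assumes an existing llama_context; the sampler-chain helpers are the standard llama.h ones, seq_id 0 and the greedy chain are arbitrary choices for illustration, and the ownership/lifetime of the chain is not specified by this patch.

#include "llama.h"

// Attach a sampler chain to sequence 0 so the backend can evaluate it as part of
// the decode graph. Done once at setup; the chain is returned so the caller can
// manage its lifetime (ownership semantics are not shown by this patch).
static llama_sampler * setup_backend_sampler(llama_context * ctx) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    llama_set_backend_sampler(ctx, /*seq_id=*/ 0, chain);
    return chain;
}

// After llama_decode(), prefer the token the backend sampler already selected;
// otherwise fall back to CPU-side sampling on the backend-filtered logits,
// mirroring the pattern in common_sampler_sample above.
static llama_token pick_token(llama_context * ctx, int32_t idx) {
    const llama_token id = llama_get_backend_sampled_token_ith(ctx, idx);
    if (id != LLAMA_TOKEN_NULL) {
        return id; // backend performed the final sampling step (backend_dist)
    }
    // const float       * logits = llama_get_backend_sampled_logits_ith   (ctx, idx);
    // const llama_token * ids    = llama_get_backend_sampled_token_ids_ith(ctx, idx);
    // ... run the usual CPU samplers over (ids, logits) here ...
    return LLAMA_TOKEN_NULL;
}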