From e856c8f95933a1896c0ebdb5cc24057dd04e5676 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Wed, 4 Feb 2026 17:43:49 +0000
Subject: [PATCH 01/25] llama : add blue noise rng implementation

---
 src/llama-sampling.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 515d6c163b..5dd094ce7a 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -333,6 +333,81 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     cur_p->size = k;
 }
 
+// pseudo-random number generator with ~6db/octave blue noise temporal autocorrelation
+struct blue_noise_rng {
+    uint8_t  bit_depth = 0;
+    uint32_t seed      = 0;
+    uint32_t position  = 0;
+
+    // binary tree of 1-bit 50% duty cycle blue noise generators
+    std::vector<std::array<int8_t, 2>> states; // {err0, err1} per tree node
+
+    blue_noise_rng() = default;
+
+    blue_noise_rng(uint8_t bit_depth, uint32_t seed) {
+        init(bit_depth, seed);
+    }
+
+    static uint32_t hash(uint32_t x) { // lowbias32: stateless 32-bit integer mixer (xorshift / multiply rounds)
+        x ^= x >> 16; x *= 0x21f0aaad;
+        x ^= x >> 15; x *= 0x735a2d97;
+        x ^= x >> 15; // final xorshift, no multiply
+        return x;
+    }
+
+    void init(uint8_t depth, uint32_t s) { // (re)configure: set depth and seed, size the tree, then reset()
+        bit_depth = std::clamp<uint8_t>(depth, 1, 16); // output width in bits, forced into 1..16
+        seed      = hash(s); // the stored seed is the hashed value, not s itself
+
+        const int n = (1 << bit_depth) - 1; // node count of a complete binary tree with bit_depth levels
+        states.resize(n);
+
+        reset(); // rewinds position and re-derives every node state from the seed
+    }
+
+    void reset() {
+        const int n = (int)states.size();
+        position  = 0;
+
+        // 5 reachable states with stationary distribution 3:3:2:1:1 (out of 10)
+        static const int8_t tbl[10][2] = {
+            { 0,  0}, { 0,  0}, { 0,  0},
+            {-1,  0}, {-1,  0}, {-1,  0},
+            { 0, -1}, { 0, -1},
+            {-2,  0},
+            {-1, -1},
+        };
+        for (int i = 0; i < n; i++) { // one draw per node, keyed by node index and seed
+            uint32_t h = hash((uint32_t)i ^ seed) % 10; // row pick; repeated rows give the 3:3:2:1:1 weighting
+            states[i] = {tbl[h][0], tbl[h][1]}; // seed-dependent initial {err0, err1}
+        }
+    }
+
+    uint16_t next() {
+        uint32_t h = hash(position ^ seed);
+        position++;
+
+        // traverse binary tree root-to-leaf, one error diffusion ditherer per bit
+        uint32_t acc = 0;
+        for (int level = 0; level < bit_depth; level++) {
+            auto & s = states[(1 << level) - 1 + acc]; // heap-style index
+
+            int    out = (s[0] >= 0) ? 1 : 0; // 1-bit quantizer: emit 1 while the pending error is non-negative
+            int8_t qe  = s[0] + (int8_t)(out ? -1 : 1); // residual error; biases upcoming bits away from the one just emitted
+
+            s[0] = s[1]; // step forward: the t+2 slot becomes the t+1 slot
+            s[1] = 0; // the new t+2 slot starts empty
+
+            // error diffusion dithering: bit `level` of this step's hash decides where the whole error goes
+            s[(h >> level) & 1 ? 0 : 1] += qe; // bit set: forward to t+1, bit clear: defer to t+2
+
+            acc = acc * 2 + out;
+        }
+
+        return (uint16_t)acc;
+    }
+};
+
 static uint32_t get_rng_seed(uint32_t seed) {
     if (seed == LLAMA_DEFAULT_SEED) {
         // use system clock if std::random_device is not a true RNG

From b2ee2fbc0a8e9fef59a8ab27d93cb99b4a23b782 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Wed, 4 Feb 2026 18:18:56 +0000
Subject: [PATCH 02/25] llama : add floating point blue noise rng

---
 src/llama-sampling.cpp | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 5dd094ce7a..7c83095582 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -333,13 +333,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     cur_p->size = k;
 }
 
-// pseudo-random number generator with ~6db/octave blue noise temporal autocorrelation
+// pseudo-random number generator with ~6db/octave blue noise
 struct blue_noise_rng {
     uint8_t  bit_depth = 0;
     uint32_t seed      = 0;
     uint32_t position  = 0;
 
-    // binary tree of 1-bit 50% duty cycle blue noise generators
+    // binary tree of 1-bit 50% duty cycle error diffusion dithering blue noise generators
     std::vector<std::array<int8_t, 2>> states; // {err0, err1} per tree node
 
     blue_noise_rng() = default;
@@ -383,11 +383,12 @@ struct blue_noise_rng {
         }
     }
 
-    uint16_t next() {
+    uint16_t next(uint32_t * hash_remainder = nullptr) {
         uint32_t h = hash(position ^ seed);
         position++;
 
-        // traverse binary tree root-to-leaf, one error diffusion ditherer per bit
+        // traverse binary tree, one error diffusion ditherer per population split
+        // thresholding output at any value still produces blue noise
         uint32_t acc = 0;
         for (int level = 0; level < bit_depth; level++) {
             auto & s = states[(1 << level) - 1 + acc]; // heap-style index
@@ -404,8 +405,31 @@ struct blue_noise_rng {
             acc = acc * 2 + out;
         }
 
+        if (hash_remainder) {
+            *hash_remainder = h >> bit_depth; // unused bits from random hash
+        }
+
         return (uint16_t)acc;
     }
+
+    // blue noise in the upper bit_depth bits, white noise hash remainder below; plain white noise if never init()ed
+    // do not use with modulo operator, as it would just produce white noise
+    uint32_t next32() {
+        uint32_t rem = 0;
+        const uint32_t val = next(&rem);
+        return bit_depth == 0 ? rem : (val << (32 - bit_depth)) | rem; // bit_depth 0 would mean a shift by 32, which is UB
+    }
+
+    // uniform double in [0, 1) with blue noise temporal autocorrelation (high 32 bits from next32(), low 32 from a second hash)
+    double nextf() {
+        double res = 0.0;
+        res += hash(position ^ ~seed); // white-noise low word; ~seed keeps it distinct from the hash next() takes at this position
+        res *= 1.0 / 4294967296.0; // scale the low word by 2^-32
+        res += next32(); // high word; this call also advances position
+        res *= 1.0 / 4294967296.0; // scale the combined 64-bit fixed-point value by 2^-32
+        if (res >= 1.0) res = std::nextafter(1.0, 0.0); // rounding to double can land exactly on 1.0; keep the range half-open
+        return res;
+    }
 };
 
 static uint32_t get_rng_seed(uint32_t seed) {

From f271576d81ca920d5d35a76f44a663da47608adb Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Wed, 4 Feb 2026 19:12:50 +0000
Subject: [PATCH 03/25] llama : initial blue noise test implementation

---
 common/arg.cpp               |   7 ++
 common/common.h              |   1 +
 common/sampling.cpp          |   6 +-
 include/llama.h              |   3 +-
 src/llama-sampling.cpp       | 197 ++++++++++++++++++++++++++++++++++-
 tools/server/server-task.cpp |   3 +
 6 files changed, 214 insertions(+), 3 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 5fbc9022c0..924b5198a2 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1577,6 +1577,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.ignore_eos = true;
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--blue-noise"},
+        "use blue noise RNG for sampling instead of white noise",
+        [](common_params & params) {
+            params.sampling.blue_noise = true;
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
         string_format("temperature (default: %.2f)", (double)params.sampling.temp),
diff --git a/common/common.h b/common/common.h
index 398ebb0960..0a76a1e26c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -209,6 +209,7 @@ struct common_params_sampling {
     bool    ignore_eos         = false;
     bool    no_perf            = false;  // disable performance metrics
     bool    timing_per_token   = false;
+    bool    blue_noise         = false;  // use blue noise RNG instead of white noise for dist sampler
 
     uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 11a1d48398..2811eb3a48 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -313,7 +313,11 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
         } else {
             // default: sample from distribution
-            samplers.push_back(llama_sampler_init_dist(params.seed));
+            if (params.blue_noise) {
+                samplers.push_back(llama_sampler_init_dist_blue_noise(params.seed));
+            } else {
+                samplers.push_back(llama_sampler_init_dist(params.seed));
+            }
         }
     } else if (params.mirostat == 1) {
         samplers.push_back(llama_sampler_init_temp(params.temp));
diff --git a/include/llama.h b/include/llama.h
index bf4e28a8be..22f08e1683 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1295,7 +1295,8 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
 
     /// seed == LLAMA_DEFAULT_SEED to use a random seed.
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist           (uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 7c83095582..09fd3a4700 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -369,7 +369,7 @@ struct blue_noise_rng {
         const int n = (int)states.size();
         position  = 0;
 
-        // 5 reachable states with stationary distribution 3:3:2:1:1 (out of 10)
+        // 5 reachable states with distribution 3:3:2:1:1
         static const int8_t tbl[10][2] = {
             { 0,  0}, { 0,  0}, { 0,  0},
             {-1,  0}, {-1,  0}, {-1,  0},
@@ -1340,6 +1340,197 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     );
 }
 
+// dist (blue noise)
+
+struct llama_sampler_dist_blue_noise : public llama_sampler_backend {
+    const uint32_t seed;
+          uint32_t seed_cur;
+
+    blue_noise_rng bn_rng;
+
+    ggml_tensor * inp_uniform;
+};
+
+static const char * llama_sampler_dist_blue_noise_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
+    return sctx->get_name();
+}
+
+static void llama_sampler_dist_blue_noise_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
+
+    // edge cases
+    if (cur_p->size == 0) {
+        cur_p->selected = -1;
+        return;
+    }
+
+    cur_p->selected = 0;
+
+    if (cur_p->size == 1) {
+        cur_p->data[0].p = 1.0f;
+        return;
+    }
+
+    // max logit for numerical stability
+    float max_l = cur_p->data[0].logit;
+    if (!cur_p->sorted) {
+        for (size_t i = 1; i < cur_p->size; ++i) {
+            max_l = std::max(max_l, cur_p->data[i].logit);
+        }
+    }
+
+    // apply softmax to obtain the probabilities
+    double sum_cum = 0.0f;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float p = expf(cur_p->data[i].logit - max_l);
+        cur_p->data[i].p = p;
+        sum_cum += p;
+    }
+
+    // sample using blue noise RNG
+    const double rnd = ctx->bn_rng.nextf();
+
+          double sum_run = 0.0f;
+    const double sum_tgt = sum_cum*rnd;
+
+    bool found = false;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (!found) {
+            sum_run += cur_p->data[i].p;
+            if (sum_run >= sum_tgt) {
+                cur_p->selected = i;
+                found = true;
+            }
+        }
+
+        // normalize probs
+        cur_p->data[i].p /= sum_cum;
+    }
+
+    assert(found);
+    if (!found) {
+        cur_p->selected = cur_p->size - 1;
+    }
+}
+
+static void llama_sampler_dist_blue_noise_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->bn_rng.init(16, ctx->seed_cur);
+}
+
+static struct llama_sampler * llama_sampler_dist_blue_noise_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_dist_blue_noise *) smpl->ctx;
+    auto * result = llama_sampler_init_dist_blue_noise(ctx->seed);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_dist_blue_noise *) result->ctx;
+
+        result_ctx->seed_cur = ctx->seed_cur;
+        result_ctx->bn_rng   = ctx->bn_rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_dist_blue_noise_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_dist_blue_noise *) smpl->ctx;
+}
+
+static bool llama_sampler_dist_blue_noise_backend_init(
+        struct llama_sampler       * smpl,
+        ggml_backend_buffer_type_t   buft) {
+    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    return res;
+}
+
+static void llama_sampler_dist_blue_noise_backend_apply(
+        struct llama_sampler      * smpl,
+        struct ggml_context       * ctx,
+        struct ggml_cgraph        * gf,
+        struct llama_sampler_data * data) {
+    GGML_UNUSED(gf);
+
+    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
+
+    sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    ggml_set_name (sctx->inp_uniform, "uniform");
+    ggml_set_input(sctx->inp_uniform);
+
+    struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
+    ggml_set_name(probs, "dist_probs");
+
+    struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs);
+    ggml_set_name(cumsum, "dist_cumsum");
+
+    struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform);
+    ggml_set_name(diff, "dist_cumsum");
+
+    struct ggml_tensor * mask = ggml_step(ctx, diff);
+    ggml_set_name(mask, "dist_mask");
+
+    struct ggml_tensor * idxf = ggml_sum(ctx, mask);
+    ggml_set_name(idxf, "dist_index_f32");
+
+    struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);
+    ggml_set_name(idx, "dist_index_i32");
+
+    struct ggml_tensor * sampled_token = idx;
+    if (data->candidates != nullptr) {
+        struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates));
+
+        sampled_token = ggml_get_rows(ctx, candidates, idx);
+        ggml_set_name(sampled_token, "dist_sampled_token");
+    }
+
+    data->sampled = sampled_token;
+    data->probs = probs;
+}
+
+static void llama_sampler_dist_blue_noise_backend_set_input(struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
+
+    GGML_ASSERT(sctx->inp_uniform != nullptr);
+
+    const float rnd = (float)sctx->bn_rng.nextf();
+
+    ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float));
+}
+
+static struct llama_sampler_i llama_sampler_dist_blue_noise_i = {
+    /* .name              = */ llama_sampler_dist_blue_noise_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_dist_blue_noise_apply,
+    /* .reset             = */ llama_sampler_dist_blue_noise_reset,
+    /* .clone             = */ llama_sampler_dist_blue_noise_clone,
+    /* .free              = */ llama_sampler_dist_blue_noise_free,
+    /* .backend_init      = */ llama_sampler_dist_blue_noise_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_dist_blue_noise_backend_apply,
+    /* .backend_set_input = */ llama_sampler_dist_blue_noise_backend_set_input,
+};
+
+struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) {
+    auto seed_cur = get_rng_seed(seed);
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_dist_blue_noise_i,
+        /* .ctx   = */ new llama_sampler_dist_blue_noise {
+            ("dist-blue-noise"),
+            /* .seed        = */ seed,
+            /* .seed_cur    = */ seed_cur,
+            /* .bn_rng      = */ blue_noise_rng(16, seed_cur),
+            /* .inp_uniform = */ nullptr,
+        }
+    );
+}
+
 // top-k
 
 struct llama_sampler_top_k : public llama_sampler_backend {
@@ -3928,6 +4119,10 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
         return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
     }
 
+    if (smpl->iface == &llama_sampler_dist_blue_noise_i) {
+        return ((const llama_sampler_dist_blue_noise *) smpl->ctx)->seed_cur;
+    }
+
     if (smpl->iface == &llama_sampler_mirostat_i) {
         return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
     }
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 2d25db63b7..16c3cf12d0 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -66,6 +66,7 @@ json task_params::to_json(bool only_metrics) const {
             {"n_keep",                    n_keep},
             {"n_discard",                 n_discard},
             {"ignore_eos",                sampling.ignore_eos},
+            {"blue_noise",                sampling.blue_noise},
             {"stream",                    stream},
             {"n_probs",                   sampling.n_probs},
             {"min_keep",                  sampling.min_keep},
@@ -125,6 +126,7 @@ json task_params::to_json(bool only_metrics) const {
         {"n_keep",                    n_keep},
         {"n_discard",                 n_discard},
         {"ignore_eos",                sampling.ignore_eos},
+        {"blue_noise",                sampling.blue_noise},
         {"stream",                    stream},
         {"logit_bias",                format_logit_bias(sampling.logit_bias)},
         {"n_probs",                   sampling.n_probs},
@@ -467,6 +469,7 @@ task_params server_task::params_from_json_cmpl(
             }
         }
 
+        params.sampling.blue_noise  = json_value(data, "blue_noise",  params_base.sampling.blue_noise);
         params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
         if (params.sampling.ignore_eos) {
             params.sampling.logit_bias.insert(

From 3b4061981b92a091ed65b853add69adcc1a5a091 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Wed, 4 Feb 2026 23:05:54 +0000
Subject: [PATCH 04/25] llama : make the sampler rng modular

---
 src/llama-sampling.cpp | 285 ++++++++++++-----------------------------
 1 file changed, 81 insertions(+), 204 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 09fd3a4700..c41666aaa7 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -432,6 +432,56 @@ struct blue_noise_rng {
     }
 };
 
+// abstract RNG interface for the dist sampler
+struct llama_dist_rng {
+    virtual ~llama_dist_rng() = default;
+
+    virtual double                          nextf()            = 0; // uniform double in [0, 1)
+    virtual void                            reseed(uint32_t s) = 0;
+    virtual std::unique_ptr<llama_dist_rng> clone() const      = 0;
+};
+
+struct llama_dist_rng_white : llama_dist_rng {
+    std::mt19937 rng;
+
+    llama_dist_rng_white(uint32_t seed) : rng(seed) {}
+
+    double nextf() override {
+        std::uniform_real_distribution<double> dist(0.0, 1.0);
+        return dist(rng);
+    }
+
+    void reseed(uint32_t s) override {
+        rng.seed(s);
+    }
+
+    std::unique_ptr<llama_dist_rng> clone() const override {
+        // copy-construct: the clone starts from the current engine state, and no
+        // engine is seeded with a placeholder 0 only to be overwritten right away
+        return std::make_unique<llama_dist_rng_white>(*this);
+    }
+};
+
+struct llama_dist_rng_blue : llama_dist_rng {
+    blue_noise_rng bn_rng;
+
+    llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {}
+
+    double nextf() override {
+        return bn_rng.nextf();
+    }
+
+    void reseed(uint32_t s) override {
+        bn_rng.init(16, s);
+    }
+
+    std::unique_ptr<llama_dist_rng> clone() const override {
+        // copy-construct: avoids building and hashing the full 16-level node table
+        // for a placeholder seed of 0 just to overwrite it with the real state
+        return std::make_unique<llama_dist_rng_blue>(*this);
+    }
+};
+
 static uint32_t get_rng_seed(uint32_t seed) {
     if (seed == LLAMA_DEFAULT_SEED) {
         // use system clock if std::random_device is not a true RNG
@@ -1122,7 +1172,7 @@ struct llama_sampler_dist : public llama_sampler_backend {
     const uint32_t seed;
           uint32_t seed_cur;
 
-    std::mt19937 rng;
+    std::unique_ptr<llama_dist_rng> rng;
 
     ggml_tensor * inp_uniform;
 };
@@ -1168,8 +1218,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     // sample from the obtained probabilities and normalize the probs in a single pass
     // this is ~3x faster on Mac with full gpt-oss vocab than the version below
     //
-    std::uniform_real_distribution<double> dist(0.0f, 1.0f);
-    const double rnd = dist(ctx->rng);
+    const double rnd = ctx->rng->nextf();
 
           double sum_run = 0.0f;
     const double sum_tgt = sum_cum*rnd;
@@ -1200,28 +1249,37 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
         cur_p->data[i].p /= sum_cum;
     }
 
-    cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+    const double rnd = ctx->rng->nextf();
+    double cum = 0.0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cum += cur_p->data[i].p;
+        if (cum >= rnd) {
+            cur_p->selected = i;
+            break;
+        }
+    }
 #endif
 }
 
 static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
     ctx->seed_cur = get_rng_seed(ctx->seed);
-    ctx->rng.seed(ctx->seed_cur);
+    ctx->rng->reseed(ctx->seed_cur);
 }
 
 static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const llama_sampler_dist *) smpl->ctx;
-    auto * result = llama_sampler_init_dist(ctx->seed);
+    auto * ctx = (llama_sampler_dist *) smpl->ctx;
 
-    // copy the state
-    {
-        auto * result_ctx = (llama_sampler_dist *) result->ctx;
-
-        result_ctx->rng = ctx->rng;
-    }
-
-    return result;
+    return llama_sampler_init(
+        /* .iface = */ smpl->iface,
+        /* .ctx   = */ new llama_sampler_dist {
+            {ctx->get_name()},
+            /* .seed        = */ ctx->seed,
+            /* .seed_cur    = */ ctx->seed_cur,
+            /* .rng         = */ ctx->rng->clone(),
+            /* .inp_uniform = */ nullptr,
+        }
+    );
 }
 
 static void llama_sampler_dist_free(struct llama_sampler * smpl) {
@@ -1307,8 +1365,8 @@ static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) {
     // std::uniform_real_distribution<double> and
     // std::uniform_real_distribution<float> with same rng will produce
     // different sequences).
-    std::uniform_real_distribution<double> dist(0.0f, 1.0f);
-    const float rnd = dist(sctx->rng);
+    // nextf returns double, equivalent to std::uniform_real_distribution<double>
+    const float rnd = (float)sctx->rng->nextf();
 
     ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float));
 }
@@ -1331,201 +1389,24 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx   = */ new llama_sampler_dist {
-            ("dist"),
+            {"dist"},
             /* .seed        = */ seed,
             /* .seed_cur    = */ seed_cur,
-            /* .rng         = */ std::mt19937(seed_cur),
+            /* .rng         = */ std::make_unique<llama_dist_rng_white>(seed_cur),
             /* .inp_uniform = */ nullptr,
         }
     );
 }
 
-// dist (blue noise)
-
-struct llama_sampler_dist_blue_noise : public llama_sampler_backend {
-    const uint32_t seed;
-          uint32_t seed_cur;
-
-    blue_noise_rng bn_rng;
-
-    ggml_tensor * inp_uniform;
-};
-
-static const char * llama_sampler_dist_blue_noise_name(const struct llama_sampler * smpl) {
-    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
-    return sctx->get_name();
-}
-
-static void llama_sampler_dist_blue_noise_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
-
-    // edge cases
-    if (cur_p->size == 0) {
-        cur_p->selected = -1;
-        return;
-    }
-
-    cur_p->selected = 0;
-
-    if (cur_p->size == 1) {
-        cur_p->data[0].p = 1.0f;
-        return;
-    }
-
-    // max logit for numerical stability
-    float max_l = cur_p->data[0].logit;
-    if (!cur_p->sorted) {
-        for (size_t i = 1; i < cur_p->size; ++i) {
-            max_l = std::max(max_l, cur_p->data[i].logit);
-        }
-    }
-
-    // apply softmax to obtain the probabilities
-    double sum_cum = 0.0f;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        float p = expf(cur_p->data[i].logit - max_l);
-        cur_p->data[i].p = p;
-        sum_cum += p;
-    }
-
-    // sample using blue noise RNG
-    const double rnd = ctx->bn_rng.nextf();
-
-          double sum_run = 0.0f;
-    const double sum_tgt = sum_cum*rnd;
-
-    bool found = false;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        if (!found) {
-            sum_run += cur_p->data[i].p;
-            if (sum_run >= sum_tgt) {
-                cur_p->selected = i;
-                found = true;
-            }
-        }
-
-        // normalize probs
-        cur_p->data[i].p /= sum_cum;
-    }
-
-    assert(found);
-    if (!found) {
-        cur_p->selected = cur_p->size - 1;
-    }
-}
-
-static void llama_sampler_dist_blue_noise_reset(struct llama_sampler * smpl) {
-    auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
-    ctx->seed_cur = get_rng_seed(ctx->seed);
-    ctx->bn_rng.init(16, ctx->seed_cur);
-}
-
-static struct llama_sampler * llama_sampler_dist_blue_noise_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const llama_sampler_dist_blue_noise *) smpl->ctx;
-    auto * result = llama_sampler_init_dist_blue_noise(ctx->seed);
-
-    // copy the state
-    {
-        auto * result_ctx = (llama_sampler_dist_blue_noise *) result->ctx;
-
-        result_ctx->seed_cur = ctx->seed_cur;
-        result_ctx->bn_rng   = ctx->bn_rng;
-    }
-
-    return result;
-}
-
-static void llama_sampler_dist_blue_noise_free(struct llama_sampler * smpl) {
-    delete (llama_sampler_dist_blue_noise *) smpl->ctx;
-}
-
-static bool llama_sampler_dist_blue_noise_backend_init(
-        struct llama_sampler       * smpl,
-        ggml_backend_buffer_type_t   buft) {
-    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
-
-    const bool res = llama_sampler_backend_support(smpl, buft);
-
-    sctx->init(res);
-
-    return res;
-}
-
-static void llama_sampler_dist_blue_noise_backend_apply(
-        struct llama_sampler      * smpl,
-        struct ggml_context       * ctx,
-        struct ggml_cgraph        * gf,
-        struct llama_sampler_data * data) {
-    GGML_UNUSED(gf);
-
-    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
-
-    sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-    ggml_set_name (sctx->inp_uniform, "uniform");
-    ggml_set_input(sctx->inp_uniform);
-
-    struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
-    ggml_set_name(probs, "dist_probs");
-
-    struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs);
-    ggml_set_name(cumsum, "dist_cumsum");
-
-    struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform);
-    ggml_set_name(diff, "dist_cumsum");
-
-    struct ggml_tensor * mask = ggml_step(ctx, diff);
-    ggml_set_name(mask, "dist_mask");
-
-    struct ggml_tensor * idxf = ggml_sum(ctx, mask);
-    ggml_set_name(idxf, "dist_index_f32");
-
-    struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);
-    ggml_set_name(idx, "dist_index_i32");
-
-    struct ggml_tensor * sampled_token = idx;
-    if (data->candidates != nullptr) {
-        struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates));
-
-        sampled_token = ggml_get_rows(ctx, candidates, idx);
-        ggml_set_name(sampled_token, "dist_sampled_token");
-    }
-
-    data->sampled = sampled_token;
-    data->probs = probs;
-}
-
-static void llama_sampler_dist_blue_noise_backend_set_input(struct llama_sampler * smpl) {
-    auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx;
-
-    GGML_ASSERT(sctx->inp_uniform != nullptr);
-
-    const float rnd = (float)sctx->bn_rng.nextf();
-
-    ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float));
-}
-
-static struct llama_sampler_i llama_sampler_dist_blue_noise_i = {
-    /* .name              = */ llama_sampler_dist_blue_noise_name,
-    /* .accept            = */ nullptr,
-    /* .apply             = */ llama_sampler_dist_blue_noise_apply,
-    /* .reset             = */ llama_sampler_dist_blue_noise_reset,
-    /* .clone             = */ llama_sampler_dist_blue_noise_clone,
-    /* .free              = */ llama_sampler_dist_blue_noise_free,
-    /* .backend_init      = */ llama_sampler_dist_blue_noise_backend_init,
-    /* .backend_accept    = */ nullptr,
-    /* .backend_apply     = */ llama_sampler_dist_blue_noise_backend_apply,
-    /* .backend_set_input = */ llama_sampler_dist_blue_noise_backend_set_input,
-};
-
 struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
     return llama_sampler_init(
-        /* .iface = */ &llama_sampler_dist_blue_noise_i,
-        /* .ctx   = */ new llama_sampler_dist_blue_noise {
-            ("dist-blue-noise"),
+        /* .iface = */ &llama_sampler_dist_i,
+        /* .ctx   = */ new llama_sampler_dist {
+            {"dist-blue-noise"},
             /* .seed        = */ seed,
             /* .seed_cur    = */ seed_cur,
-            /* .bn_rng      = */ blue_noise_rng(16, seed_cur),
+            /* .rng         = */ std::make_unique<llama_dist_rng_blue>(seed_cur),
             /* .inp_uniform = */ nullptr,
         }
     );
@@ -4119,10 +4000,6 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
         return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
     }
 
-    if (smpl->iface == &llama_sampler_dist_blue_noise_i) {
-        return ((const llama_sampler_dist_blue_noise *) smpl->ctx)->seed_cur;
-    }
-
     if (smpl->iface == &llama_sampler_mirostat_i) {
         return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
     }

From 766d86df29e34f2fdae74e4b37b8875aa02a1906 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Wed, 4 Feb 2026 23:35:22 +0000
Subject: [PATCH 05/25] llama : cleanup and restore alternate code path

---
 src/llama-sampling.cpp | 48 +++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c41666aaa7..060538eb12 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -214,7 +214,8 @@ static void llama_token_data_array_partial_sort_inplace(llama_token_data_array *
     cur_p->sorted = true;
 }
 
-static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
+template<typename RNG>
+static int llama_sample_dist(llama_token_data_array * cur_p, RNG & rng) {
     // iterator for the probabilities
 #ifdef __GNUC__
     #pragma GCC diagnostic push
@@ -334,6 +335,10 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
 }
 
 // pseudo-random number generator with ~6db/octave blue noise
+// important: blue noise properties cannot be preserved when
+// the generator is used for multiple purposes simultaneously,
+// nor when multiple next() calls are combined to construct a larger value,
+// nor when integer outputs are reduced with the modulo operator
 struct blue_noise_rng {
     uint8_t  bit_depth = 0;
     uint32_t seed      = 0;
@@ -436,16 +441,38 @@ struct blue_noise_rng {
 struct llama_dist_rng {
     virtual ~llama_dist_rng() = default;
 
+    virtual uint32_t                        rng_min()          = 0;
+    virtual uint32_t                        rng_max()          = 0;
+    virtual uint32_t                        next()         = 0; // uniform bits in [rng_min(), rng_max()]
     virtual double                          nextf()            = 0; // uniform double in [0, 1)
     virtual void                            reseed(uint32_t s) = 0;
     virtual std::unique_ptr<llama_dist_rng> clone() const      = 0;
 };
 
+// adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution
+// note: not guaranteed to preserve blue noise properties
+struct llama_dist_urbg {
+    using result_type = uint32_t;
+
+    llama_dist_rng & rng;
+
+    result_type min() { return rng.rng_min(); }
+    result_type max() { return rng.rng_max(); }
+    result_type operator()() { return rng.next(); }
+};
+
 struct llama_dist_rng_white : llama_dist_rng {
     std::mt19937 rng;
 
     llama_dist_rng_white(uint32_t seed) : rng(seed) {}
 
+    uint32_t rng_min() override { return std::mt19937::min(); }
+    uint32_t rng_max() override { return std::mt19937::max(); }
+
+    uint32_t next() override {
+        return rng();
+    }
+
     double nextf() override {
         std::uniform_real_distribution<double> dist(0.0, 1.0);
         return dist(rng);
@@ -467,6 +494,13 @@ struct llama_dist_rng_blue : llama_dist_rng {
 
     llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {}
 
+    uint32_t rng_min() override { return 0; }
+    uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; }
+
+    uint32_t next() override {
+        return bn_rng.next();
+    }
+
     double nextf() override {
         return bn_rng.nextf();
     }
@@ -1249,15 +1283,9 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
         cur_p->data[i].p /= sum_cum;
     }
 
-    const double rnd = ctx->rng->nextf();
-    double cum = 0.0;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cum += cur_p->data[i].p;
-        if (cum >= rnd) {
-            cur_p->selected = i;
-            break;
-        }
-    }
+    // this implementation is not guaranteed to preserve blue noise properties
+    llama_dist_urbg urbg{*ctx->rng};
+    cur_p->selected = llama_sample_dist(cur_p, urbg);
 #endif
 }
 

From ad73188337ad45a60d404d8cdb97a0eaee6e2599 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Thu, 5 Feb 2026 00:53:51 +0000
Subject: [PATCH 06/25] llama : note on blue noise properties

---
 src/llama-sampling.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 060538eb12..408bdacccc 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -335,6 +335,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
 }
 
 // pseudo-random number generator with ~6db/octave blue noise
+// this generator produces a uniform distribution
 // important: blue noise properties cannot be preserved when 
 // the generator is used for multiple purposes simultaneously
 // nor when multiple next calls are used to construct a larger value

From d5def78bb0b32c686e8d566493acead71d8f0535 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Thu, 5 Feb 2026 03:15:55 +0000
Subject: [PATCH 07/25] llama : note on generative error diffusion

---
 src/llama-sampling.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 408bdacccc..9b651c816d 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -334,6 +334,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     cur_p->size = k;
 }
 
+// generative error diffusion for sequential blue noise
 // pseudo-random number generator with ~6db/octave blue noise
 // this generator produces a uniform distribution
 // important: blue noise properties cannot be preserved when 

From e829f2904e3ea76e8ce8982eee9fc1adc6a95602 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Thu, 5 Feb 2026 10:46:18 +0000
Subject: [PATCH 08/25] sampling : blue noise requires tokens to be sorted

---
 src/llama-sampling.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 9b651c816d..0e7d4cb178 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -443,9 +443,10 @@ struct blue_noise_rng {
 struct llama_dist_rng {
     virtual ~llama_dist_rng() = default;
 
+    virtual bool                            requires_sorted()  = 0; // whether the RNG requires sorted input for proper properties
     virtual uint32_t                        rng_min()          = 0;
     virtual uint32_t                        rng_max()          = 0;
-    virtual uint32_t                        next()         = 0; // uniform bits in [rng_min(), rng_max()]
+    virtual uint32_t                        next()             = 0; // uniform bits in [rng_min(), rng_max()]
     virtual double                          nextf()            = 0; // uniform double in [0, 1)
     virtual void                            reseed(uint32_t s) = 0;
     virtual std::unique_ptr<llama_dist_rng> clone() const      = 0;
@@ -468,6 +469,8 @@ struct llama_dist_rng_white : llama_dist_rng {
 
     llama_dist_rng_white(uint32_t seed) : rng(seed) {}
 
+    bool requires_sorted() override { return false; }
+
     uint32_t rng_min() override { return std::mt19937::min(); }
     uint32_t rng_max() override { return std::mt19937::max(); }
 
@@ -496,6 +499,8 @@ struct llama_dist_rng_blue : llama_dist_rng {
 
     llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {}
 
+    bool requires_sorted() override { return true; }
+
     uint32_t rng_min() override { return 0; }
     uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; }
 
@@ -1234,6 +1239,11 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
         return;
     }
 
+    // sort if required by the RNG (e.g., blue noise needs sorted input for proper temporal properties)
+    if (ctx->rng->requires_sorted() && !cur_p->sorted) {
+        llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
+    }
+
     // max logit for numerical stability
     float max_l = cur_p->data[0].logit;
     if (!cur_p->sorted) {

From 267cd808a2adff78004b0f96bcde9b2b7d37f0f1 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Sat, 7 Feb 2026 02:42:21 +0000
Subject: [PATCH 09/25] sampling : cleanup blue noise rng with some more notes

---
 src/llama-sampler.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 9db31d4a2c..afa5fe16be 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -355,6 +355,10 @@ struct blue_noise_rng {
         init(bit_depth, seed);
     }
 
+    // currently this uses lowbias32 as the white noise RNG source
+    // in practice, any white noise RNG source works
+    // this random noise is used to perturb the error diffusion weights (binary decision)
+    // as well as to fill in the low bits of the double precision output to eliminate aliasing
     static uint32_t hash(uint32_t x) { // lowbias32
         x ^= x >> 16; x *= 0x21f0aaad;
         x ^= x >> 15; x *= 0x735a2d97;
@@ -377,6 +381,7 @@ struct blue_noise_rng {
         position  = 0;
 
         // 5 reachable states with distribution 3:3:2:1:1
+        // established based on empirical testing
         static const int8_t tbl[10][2] = {
             { 0,  0}, { 0,  0}, { 0,  0},
             {-1,  0}, {-1,  0}, {-1,  0},
@@ -444,9 +449,13 @@ struct llama_dist_rng {
     virtual ~llama_dist_rng() = default;
 
     virtual bool                            requires_sorted()  = 0; // whether the RNG requires sorted input for proper properties
+    
+    // for compatilibility with std::discrete_distribution
+    // nly used in a disabled branch of llama_sampler_dist_apply
     virtual uint32_t                        rng_min()          = 0;
     virtual uint32_t                        rng_max()          = 0;
     virtual uint32_t                        next()             = 0; // uniform bits in [rng_min(), rng_max()]
+
     virtual double                          nextf()            = 0; // uniform double in [0, 1)
     virtual void                            reseed(uint32_t s) = 0;
     virtual std::unique_ptr<llama_dist_rng> clone() const      = 0;
@@ -454,6 +463,7 @@ struct llama_dist_rng {
 
 // adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution
 // note: not guaranteed to preserve blue noise properties
+// this is only used in a disabled branch of llama_sampler_dist_apply, added for compatibility
 struct llama_dist_urbg {
     using result_type = uint32_t;
 
@@ -464,10 +474,10 @@ struct llama_dist_urbg {
     result_type operator()() { return rng.next(); }
 };
 
-struct llama_dist_rng_white : llama_dist_rng {
+struct llama_dist_rng_mt19937 : llama_dist_rng {
     std::mt19937 rng;
 
-    llama_dist_rng_white(uint32_t seed) : rng(seed) {}
+    llama_dist_rng_mt19937(uint32_t seed) : rng(seed) {}
 
     bool requires_sorted() override { return false; }
 
@@ -488,7 +498,7 @@ struct llama_dist_rng_white : llama_dist_rng {
     }
 
     std::unique_ptr<llama_dist_rng> clone() const override {
-        auto c = std::make_unique<llama_dist_rng_white>(0);
+        auto c = std::make_unique<llama_dist_rng_mt19937>(0);
         c->rng = rng;
         return c;
     }
@@ -1432,7 +1442,7 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
             {"dist"},
             /* .seed        = */ seed,
             /* .seed_cur    = */ seed_cur,
-            /* .rng         = */ std::make_unique<llama_dist_rng_white>(seed_cur),
+            /* .rng         = */ std::make_unique<llama_dist_rng_mt19937>(seed_cur),
             /* .inp_uniform = */ nullptr,
         }
     );

From 1b1b2cbe0e2660a88b98f0671bfa98cec7213530 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Sat, 7 Feb 2026 04:27:52 +0000
Subject: [PATCH 10/25] sampling : also apply sorting in backend path when blue
 noise rng is selected

---
 src/llama-sampler.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index afa5fe16be..dc7394ae51 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -1361,6 +1361,30 @@ static void llama_sampler_dist_backend_apply(
     ggml_set_name (sctx->inp_uniform, "uniform");
     ggml_set_input(sctx->inp_uniform);
 
+    // If the RNG requires sorted input (e.g., blue noise), sort logits first
+    // so the CDF walk operates in probability-rank space, not arbitrary vocab order.
+    if (sctx->rng->requires_sorted()) {
+        auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) {
+            GGML_ASSERT(ggml_nrows(a) == 1);
+            struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]);
+            struct ggml_tensor * a_sorted   = ggml_get_rows(ctx, a_reshaped, b);
+            return ggml_reshape_1d(ctx, a_sorted, a->ne[0]);
+        };
+
+        struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC);
+        ggml_set_name(sorted_idx, "dist_sorted_idx");
+
+        data->logits = ggml_sort(data->logits, sorted_idx);
+        ggml_set_name(data->logits, "dist_sorted_logits");
+
+        if (data->candidates) {
+            data->candidates = ggml_sort(data->candidates, sorted_idx);
+        } else {
+            data->candidates = sorted_idx;
+        }
+        ggml_set_name(data->candidates, "dist_sorted_candidates");
+    }
+
     struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
     ggml_set_name(probs, "dist_probs");
 

From 15ade86a75d1b26f61be1d084f004e5df00d421f Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Sat, 7 Feb 2026 04:49:37 +0000
Subject: [PATCH 11/25] sampling : simplify clone

---
 src/llama-sampler.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index dc7394ae51..0aa000f319 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -498,9 +498,7 @@ struct llama_dist_rng_mt19937 : llama_dist_rng {
     }
 
     std::unique_ptr<llama_dist_rng> clone() const override {
-        auto c = std::make_unique<llama_dist_rng_mt19937>(0);
-        c->rng = rng;
-        return c;
+        return std::make_unique<llama_dist_rng_mt19937>(*this);
     }
 };
 
@@ -527,9 +525,7 @@ struct llama_dist_rng_blue : llama_dist_rng {
     }
 
     std::unique_ptr<llama_dist_rng> clone() const override {
-        auto c = std::make_unique<llama_dist_rng_blue>(0);
-        c->bn_rng = bn_rng;
-        return c;
+        return std::make_unique<llama_dist_rng_blue>(*this);
     }
 };
 

From a0323a989df46cacc0c17f5577ec44080b702f28 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Sat, 7 Feb 2026 05:03:21 +0000
Subject: [PATCH 12/25] sampling : comment on state use

---
 src/llama-sampler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 0aa000f319..0d1586508e 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -371,7 +371,7 @@ struct blue_noise_rng {
         seed      = hash(s);
 
         const int n = (1 << bit_depth) - 1;
-        states.resize(n);
+        states.resize(n); // at 16-bit depth, this uses 128KB of state
 
         reset();
     }

From 23b5a5c026f77e0a51361e7dc5ec3292f481f65f Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Sat, 7 Feb 2026 09:19:23 +0000
Subject: [PATCH 13/25] sampling : fix whitespace

---
 src/llama-sampler.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 0d1586508e..275e4c5b56 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -337,7 +337,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
 // generative error diffusion for sequential blue noise
 // pseudo-random number generator with ~6db/octave blue noise
 // this generator produces a uniform distribution
-// important: blue noise properties cannot be preserved when 
+// important: blue noise properties cannot be preserved when
 // the generator is used for multiple purposes simultaneously
 // nor when multiple next calls are used to construct a larger value
 // nor when integer outputs are used with the modulo operator
@@ -449,7 +449,7 @@ struct llama_dist_rng {
     virtual ~llama_dist_rng() = default;
 
     virtual bool                            requires_sorted()  = 0; // whether the RNG requires sorted input for proper properties
-    
+
     // for compatilibility with std::discrete_distribution
     // nly used in a disabled branch of llama_sampler_dist_apply
     virtual uint32_t                        rng_min()          = 0;

From 7f433763b6683b7f6e5a323729acf73a1a4e8ec9 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 02:47:42 +0000
Subject: [PATCH 14/25] sampling : implement disabled branch to support blue
 noise

---
 src/llama-sampler.cpp | 54 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 275e4c5b56..02258d981b 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -448,10 +448,12 @@ struct blue_noise_rng {
 struct llama_dist_rng {
     virtual ~llama_dist_rng() = default;
 
-    virtual bool                            requires_sorted()  = 0; // whether the RNG requires sorted input for proper properties
+    // whether the RNG requires sorted input for proper properties
+    // this also indicates whether the RNG output itself must be consumed in a coherent order
+    virtual bool                            requires_sorted()  = 0;
 
-    // for compatilibility with std::discrete_distribution
-    // nly used in a disabled branch of llama_sampler_dist_apply
+    // for compatibility with std::discrete_distribution
+    // only used in a disabled branch of llama_sampler_dist_apply
     virtual uint32_t                        rng_min()          = 0;
     virtual uint32_t                        rng_max()          = 0;
     virtual uint32_t                        next()             = 0; // uniform bits in [rng_min(), rng_max()]
@@ -474,6 +476,48 @@ struct llama_dist_urbg {
     result_type operator()() { return rng.next(); }
 };
 
+// wrapper to use existing llama_sample_dist for mt19937, otherwise implements CDF walk directly
+// this is currently only used in a disabled branch of llama_sampler_dist_apply, added for compatibility and potential use by other samplers
+// flag normalized to skip recomputing the probability sum when probs already sum to 1
+static int llama_sample_dist_rng(llama_token_data_array * cur_p, llama_dist_rng & rng, bool normalized = false) {
+    if (!rng.requires_sorted()) {
+        llama_dist_urbg urbg{rng};
+        return llama_sample_dist(cur_p, urbg);
+    }
+
+    if (!cur_p->sorted) {
+        llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
+    }
+    const double rnd = rng.nextf();
+
+    double sum_run = 0.0;
+
+    if (normalized) {
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            sum_run += cur_p->data[i].p;
+            if (sum_run >= rnd) {
+                return i;
+            }
+        }
+    } else {
+        double sum_cum = 0.0;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            sum_cum += cur_p->data[i].p;
+        }
+
+        const double sum_tgt = sum_cum * rnd;
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            sum_run += cur_p->data[i].p;
+            if (sum_run >= sum_tgt) {
+                return i;
+            }
+        }
+    }
+
+    return (int)(cur_p->size - 1);
+}
+
 struct llama_dist_rng_mt19937 : llama_dist_rng {
     std::mt19937 rng;
 
@@ -1301,9 +1345,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
         cur_p->data[i].p /= sum_cum;
     }
 
-    // this implementation is not guaranteed to preserve blue noise properties
-    llama_dist_urbg urbg{*ctx->rng};
-    cur_p->selected = llama_sample_dist(cur_p, urbg);
+    cur_p->selected = llama_sample_dist_rng(cur_p, *ctx->rng, true);
 #endif
 }
 

From ae31b151e9a1ba8b1b17ecda07a26f3084869ed7 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 03:27:12 +0000
Subject: [PATCH 15/25] sampling : cleaner approach for constructing floating
 point value

---
 src/llama-sampler.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 02258d981b..ad0a20f0ff 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -434,13 +434,10 @@ struct blue_noise_rng {
 
     // uniform double in [0, 1) with blue noise temporal autocorrelation
     double nextf() {
-        double res = 0.0;
-        res += hash(position ^ ~seed); // fill low bits with white noise
-        res *= 1.0 / 4294967296.0;
-        res += next32();
-        res *= 1.0 / 4294967296.0;
-        if (res >= 1.0) res = std::nextafter(1.0, 0.0);
-        return res;
+        uint32_t lo = hash(position ^ ~seed); // white noise low bits
+        uint32_t hi = next32();               // blue noise high bits
+        uint64_t combined = ((uint64_t)hi << 32) | lo;
+        return (combined >> 11) * 0x1.0p-53;
     }
 };
 

From 2c7269fd8da3f4b7899aed5f6754520bcc069a4f Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 04:03:10 +0000
Subject: [PATCH 16/25] sampling : make white noise source for blue noise
 modular as well

---
 src/llama-sampler.cpp | 193 +++++++++++++++++++++++++++++-------------
 1 file changed, 136 insertions(+), 57 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index ad0a20f0ff..2ddb2978eb 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -334,6 +334,27 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     cur_p->size = k;
 }
 
+// abstract RNG interface for the dist sampler
+struct llama_dist_rng {
+    virtual ~llama_dist_rng() = default;
+
+    // whether the RNG requires sorted input for proper properties
+    // this also indicates whether the RNG output itself must be consumed in a coherent order
+    virtual bool                            requires_sorted()  = 0;
+
+    // for compatibility with std::discrete_distribution
+    // only used in a disabled branch of llama_sampler_dist_apply
+    virtual uint32_t                        rng_min()          = 0;
+    virtual uint32_t                        rng_max()          = 0;
+    virtual uint32_t                        next()             = 0; // uniform bits in [rng_min(), rng_max()]
+
+    virtual uint32_t                        next32()           = 0; // uniform 32 bits
+    virtual uint64_t                        next64()           = 0; // uniform 64 bits
+    virtual double                          nextf()            = 0; // uniform double in [0, 1)
+    virtual void                            reseed(uint32_t s) = 0;
+    virtual std::unique_ptr<llama_dist_rng> clone() const      = 0;
+};
+
 // generative error diffusion for sequential blue noise
 // pseudo-random number generator with ~6db/octave blue noise
 // this generator produces a uniform distribution
@@ -343,32 +364,38 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
 // nor when integer outputs are used with the modulo operator
 struct blue_noise_rng {
     uint8_t  bit_depth = 0;
-    uint32_t seed      = 0;
-    uint32_t position  = 0;
+    std::unique_ptr<llama_dist_rng> rng;
 
     // binary tree of 1-bit 50% duty cycle error diffusion dithering blue noise generators
     std::vector<std::array<int8_t, 2>> states; // {err0, err1} per tree node
 
     blue_noise_rng() = default;
 
-    blue_noise_rng(uint8_t bit_depth, uint32_t seed) {
-        init(bit_depth, seed);
+    blue_noise_rng(uint8_t bit_depth, std::unique_ptr<llama_dist_rng> rng) {
+        init(bit_depth, std::move(rng));
     }
 
-    // currently this uses lowbias32 as the white noise RNG source
-    // in practice, any white noise RNG source works
-    // this random noise is used to perturb the error diffusion weights (binary decision)
-    // as well as to fill in the low bits of the double precision output to eliminate aliasing
-    static uint32_t hash(uint32_t x) { // lowbias32
-        x ^= x >> 16; x *= 0x21f0aaad;
-        x ^= x >> 15; x *= 0x735a2d97;
-        x ^= x >> 15;
-        return x;
+    // custom copy (clone the underlying RNG)
+    blue_noise_rng(const blue_noise_rng & other)
+        : bit_depth(other.bit_depth)
+        , rng(other.rng ? other.rng->clone() : nullptr)
+        , states(other.states) {}
+
+    blue_noise_rng & operator=(const blue_noise_rng & other) {
+        if (this != &other) {
+            bit_depth = other.bit_depth;
+            rng       = other.rng ? other.rng->clone() : nullptr;
+            states    = other.states;
+        }
+        return *this;
     }
 
-    void init(uint8_t depth, uint32_t s) {
+    blue_noise_rng(blue_noise_rng &&) = default;
+    blue_noise_rng & operator=(blue_noise_rng &&) = default;
+
+    void init(uint8_t depth, std::unique_ptr<llama_dist_rng> source) {
         bit_depth = std::clamp<uint8_t>(depth, 1, 16);
-        seed      = hash(s);
+        rng       = std::move(source);
 
         const int n = (1 << bit_depth) - 1;
         states.resize(n); // at 16-bit depth, this uses 128KB of state
@@ -376,9 +403,13 @@ struct blue_noise_rng {
         reset();
     }
 
+    void reseed(uint32_t s) {
+        rng->reseed(s);
+        reset();
+    }
+
     void reset() {
         const int n = (int)states.size();
-        position  = 0;
 
         // 5 reachable states with distribution 3:3:2:1:1
         // established based on empirical testing
@@ -390,15 +421,12 @@ struct blue_noise_rng {
             {-1, -1},
         };
         for (int i = 0; i < n; i++) {
-            uint32_t h = hash((uint32_t)i ^ seed) % 10;
+            uint32_t h = rng->next32() % 10;
             states[i] = {tbl[h][0], tbl[h][1]}; // random initial state
         }
     }
 
-    uint16_t next(uint32_t * hash_remainder = nullptr) {
-        uint32_t h = hash(position ^ seed);
-        position++;
-
+    uint16_t advance(uint32_t h) {
         // traverse binary tree, one error diffusion ditherer per population split
         // thresholding output at any value still produces blue noise
         uint32_t acc = 0;
@@ -416,50 +444,39 @@ struct blue_noise_rng {
 
             acc = acc * 2 + out;
         }
-
-        if (hash_remainder) {
-            *hash_remainder = h >> bit_depth; // unused bits from random hash
-        }
-
         return (uint16_t)acc;
     }
 
-    // blue noise in the upper bit_depth bits, white noise hash remainder in the lower bits
+    uint16_t next() {
+        uint32_t h = rng->next32();
+        return advance(h);
+    }
+
+    // blue noise in the upper bit_depth bits, white noise in the lower bits
     // do not use with modulo operator, as it would just produce white noise
     uint32_t next32() {
-        uint32_t rem;
-        uint32_t val = next(&rem);
-        return (val << (32 - bit_depth)) | rem;
+        uint32_t h   = rng->next32();
+        uint32_t val = advance(h);
+        return (val << (32 - bit_depth)) | (h >> bit_depth);
+    }
+
+    // blue noise in the upper bits, white noise in the lower bits
+    uint64_t next64() {
+        uint64_t r   = rng->next64();
+        uint32_t lo  = (uint32_t)r;
+        uint32_t h   = (uint32_t)(r >> 32);
+        uint32_t val = advance(h);
+        uint32_t hi  = (val << (32 - bit_depth)) | (h >> bit_depth);
+        return ((uint64_t)hi << 32) | lo;
     }
 
     // uniform double in [0, 1) with blue noise temporal autocorrelation
     double nextf() {
-        uint32_t lo = hash(position ^ ~seed); // white noise low bits
-        uint32_t hi = next32();               // blue noise high bits
-        uint64_t combined = ((uint64_t)hi << 32) | lo;
+        uint64_t combined = next64();
         return (combined >> 11) * 0x1.0p-53;
     }
 };
 
-// abstract RNG interface for the dist sampler
-struct llama_dist_rng {
-    virtual ~llama_dist_rng() = default;
-
-    // whether the RNG requires sorted input for proper properties
-    // this also indicates whether the RNG output itself must be consumed in a coherent order
-    virtual bool                            requires_sorted()  = 0;
-
-    // for compatibility with std::discrete_distribution
-    // only used in a disabled branch of llama_sampler_dist_apply
-    virtual uint32_t                        rng_min()          = 0;
-    virtual uint32_t                        rng_max()          = 0;
-    virtual uint32_t                        next()             = 0; // uniform bits in [rng_min(), rng_max()]
-
-    virtual double                          nextf()            = 0; // uniform double in [0, 1)
-    virtual void                            reseed(uint32_t s) = 0;
-    virtual std::unique_ptr<llama_dist_rng> clone() const      = 0;
-};
-
 // adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution
 // note: not guaranteed to preserve blue noise properties
 // this is only used in a disabled branch of llama_sampler_dist_apply, added for compatibility
@@ -515,6 +532,55 @@ static int llama_sample_dist_rng(llama_token_data_array * cur_p, llama_dist_rng
     return (int)(cur_p->size - 1);
 }
 
+struct llama_dist_rng_lowbias32 : llama_dist_rng {
+    uint32_t hashed_seed = 0;
+    uint32_t position    = 0;
+
+    llama_dist_rng_lowbias32(uint32_t seed) : hashed_seed(hash(seed)), position(0) {}
+
+    bool requires_sorted() override { return false; }
+    uint32_t rng_min() override { return 0; }
+    uint32_t rng_max() override { return UINT32_MAX; }
+
+    static uint32_t hash(uint32_t x) { // lowbias32
+        // coefficients from https://github.com/skeeto/hash-prospector/issues/19
+        x ^= x >> 16; x *= 0x21f0aaad;
+        x ^= x >> 15; x *= 0x735a2d97;
+        x ^= x >> 15;
+        return x;
+    }
+
+    uint32_t next() override {
+        uint32_t val = hash(position ^ hashed_seed);
+        position++;
+        return val;
+    }
+
+    uint32_t next32() override {
+        return next();
+    }
+
+    uint64_t next64() override {
+        uint64_t lo = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed
+        uint64_t hi = next();
+        return (hi << 32) | lo;
+    }
+
+    double nextf() override {
+        uint64_t combined = next64();
+        return (combined >> 11) * 0x1.0p-53;
+    }
+
+    void reseed(uint32_t s) override {
+        hashed_seed = hash(s);
+        position = 0;
+    }
+
+    std::unique_ptr<llama_dist_rng> clone() const override {
+        return std::make_unique<llama_dist_rng_lowbias32>(*this);
+    }
+};
+
 struct llama_dist_rng_mt19937 : llama_dist_rng {
     std::mt19937 rng;
 
@@ -524,11 +590,18 @@ struct llama_dist_rng_mt19937 : llama_dist_rng {
 
     uint32_t rng_min() override { return std::mt19937::min(); }
     uint32_t rng_max() override { return std::mt19937::max(); }
+    uint32_t next() override { return rng(); }
 
-    uint32_t next() override {
+    uint32_t next32() override {
         return rng();
     }
 
+    uint64_t next64() override {
+        uint64_t hi = (uint64_t)rng() << 32;
+        uint64_t lo = (uint64_t)rng();
+        return hi | lo;
+    }
+
     double nextf() override {
         std::uniform_real_distribution<double> dist(0.0, 1.0);
         return dist(rng);
@@ -546,15 +619,21 @@ struct llama_dist_rng_mt19937 : llama_dist_rng {
 struct llama_dist_rng_blue : llama_dist_rng {
     blue_noise_rng bn_rng;
 
-    llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {}
+    llama_dist_rng_blue(uint32_t seed)
+        : bn_rng(16, std::make_unique<llama_dist_rng_lowbias32>(seed)) {}
 
     bool requires_sorted() override { return true; }
 
     uint32_t rng_min() override { return 0; }
     uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; }
+    uint32_t next() override { return bn_rng.next(); }
 
-    uint32_t next() override {
-        return bn_rng.next();
+    uint32_t next32() override {
+        return bn_rng.next32();
+    }
+
+    uint64_t next64() override {
+        return bn_rng.next64();
     }
 
     double nextf() override {
@@ -562,7 +641,7 @@ struct llama_dist_rng_blue : llama_dist_rng {
     }
 
     void reseed(uint32_t s) override {
-        bn_rng.init(16, s);
+        bn_rng.reseed(s);
     }
 
     std::unique_ptr<llama_dist_rng> clone() const override {

From f3acd240d68b888c7d90ae02337b4f52a3108266 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 04:09:57 +0000
Subject: [PATCH 17/25] sampling : simplify next64 bit composition

---
 src/llama-sampler.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 2ddb2978eb..df30a613d5 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -463,11 +463,8 @@ struct blue_noise_rng {
     // blue noise in the upper bits, white noise in the lower bits
     uint64_t next64() {
         uint64_t r   = rng->next64();
-        uint32_t lo  = (uint32_t)r;
-        uint32_t h   = (uint32_t)(r >> 32);
-        uint32_t val = advance(h);
-        uint32_t hi  = (val << (32 - bit_depth)) | (h >> bit_depth);
-        return ((uint64_t)hi << 32) | lo;
+        uint32_t val = advance((uint32_t)r);
+        return ((uint64_t)val << (64 - bit_depth)) | (r >> bit_depth);
     }
 
     // uniform double in [0, 1) with blue noise temporal autocorrelation
@@ -561,8 +558,8 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng {
     }
 
     uint64_t next64() override {
-        uint64_t lo = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed
-        uint64_t hi = next();
+        uint64_t hi = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed
+        uint64_t lo = next();
         return (hi << 32) | lo;
     }
 

From 75cb3e8f2eb6e0d6f44559cda9ebc80f2b8b27c0 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 04:17:07 +0000
Subject: [PATCH 18/25] sampling : test against previous implementation

---
 src/llama-sampler.cpp | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index df30a613d5..9f5cba09c2 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -352,6 +352,7 @@ struct llama_dist_rng {
     virtual uint64_t                        next64()           = 0; // uniform 64 bits
     virtual double                          nextf()            = 0; // uniform double in [0, 1)
     virtual void                            reseed(uint32_t s) = 0;
+    virtual void                            reset()            = 0; // reset to post-seed state
     virtual std::unique_ptr<llama_dist_rng> clone() const      = 0;
 };
 
@@ -400,15 +401,15 @@ struct blue_noise_rng {
         const int n = (1 << bit_depth) - 1;
         states.resize(n); // at 16-bit depth, this uses 128KB of state
 
-        reset();
+        reset_states();
     }
 
     void reseed(uint32_t s) {
         rng->reseed(s);
-        reset();
+        reset_states();
     }
 
-    void reset() {
+    void reset_states() {
         const int n = (int)states.size();
 
         // 5 reachable states with distribution 3:3:2:1:1
@@ -424,6 +425,8 @@ struct blue_noise_rng {
             uint32_t h = rng->next32() % 10;
             states[i] = {tbl[h][0], tbl[h][1]}; // random initial state
         }
+
+        rng->reset(); // reset position so generation starts from 0
     }
 
     uint16_t advance(uint32_t h) {
@@ -573,15 +576,20 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng {
         position = 0;
     }
 
+    void reset() override {
+        position = 0;
+    }
+
     std::unique_ptr<llama_dist_rng> clone() const override {
         return std::make_unique<llama_dist_rng_lowbias32>(*this);
     }
 };
 
 struct llama_dist_rng_mt19937 : llama_dist_rng {
+    uint32_t     seed;
     std::mt19937 rng;
 
-    llama_dist_rng_mt19937(uint32_t seed) : rng(seed) {}
+    llama_dist_rng_mt19937(uint32_t seed) : seed(seed), rng(seed) {}
 
     bool requires_sorted() override { return false; }
 
@@ -605,9 +613,14 @@ struct llama_dist_rng_mt19937 : llama_dist_rng {
     }
 
     void reseed(uint32_t s) override {
+        seed = s;
         rng.seed(s);
     }
 
+    void reset() override {
+        rng.seed(seed);
+    }
+
     std::unique_ptr<llama_dist_rng> clone() const override {
         return std::make_unique<llama_dist_rng_mt19937>(*this);
     }
@@ -641,6 +654,11 @@ struct llama_dist_rng_blue : llama_dist_rng {
         bn_rng.reseed(s);
     }
 
+    void reset() override {
+        bn_rng.rng->reset(); // rewind the source first: reset_states() draws the initial states from it
+        bn_rng.reset_states();
+    }
+
     std::unique_ptr<llama_dist_rng> clone() const override {
         return std::make_unique<llama_dist_rng_blue>(*this);
     }

From 7bb5d4b8907a0092cecbd4799395a664a95695b5 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 04:30:31 +0000
Subject: [PATCH 19/25] sampling : disable testing code

---
 src/llama-sampler.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 9f5cba09c2..34f8a62ab4 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -339,7 +339,7 @@ struct llama_dist_rng {
     virtual ~llama_dist_rng() = default;
 
     // whether the RNG requires sorted input for proper properties
-    // this also indicates whether the RNG output itself must be consumed in a coherent order
+    // this also indicates whether the RNG output itself must be consumed in a sequential order
     virtual bool                            requires_sorted()  = 0;
 
     // for compatibility with std::discrete_distribution
@@ -426,7 +426,11 @@ struct blue_noise_rng {
             states[i] = {tbl[h][0], tbl[h][1]}; // random initial state
         }
 
+#if 0
+        // test against initial implementation outputs
+        // note: white noise padding in next64 is slightly different, but minimally consequential for testing
         rng->reset(); // reset position so generation starts from 0
+#endif
     }
 
     uint16_t advance(uint32_t h) {

From 2826de3189d8f3a0e95961be7884fe30ec46cf4a Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 05:23:58 +0000
Subject: [PATCH 20/25] sampling : make rng selection fully modular

---
 common/arg.cpp               | 13 +++++++++++++
 common/common.h              |  1 +
 common/sampling.cpp          | 13 ++++++-------
 include/llama.h              |  9 +++++++--
 src/llama-sampler.cpp        | 34 ++++++++++++++++++----------------
 tools/server/server-task.cpp | 10 ++++++++++
 6 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 924b5198a2..7181e31cd7 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1584,6 +1584,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.blue_noise = true;
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--rng-type"}, "{mt19937,lowbias32}",
+        "RNG type for sampling (default: mt19937)",
+        [](common_params & params, const std::string & value) {
+            if (value == "mt19937") {
+                params.sampling.rng_type = LLAMA_RNG_TYPE_MT19937;
+            } else if (value == "lowbias32") {
+                params.sampling.rng_type = LLAMA_RNG_TYPE_LOWBIAS32;
+            } else {
+                throw std::invalid_argument("invalid value");
+            }
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
         string_format("temperature (default: %.2f)", (double)params.sampling.temp),
diff --git a/common/common.h b/common/common.h
index 0a76a1e26c..662eeb51e2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -210,6 +210,7 @@ struct common_params_sampling {
     bool    no_perf            = false;  // disable performance metrics
     bool    timing_per_token   = false;
     bool    blue_noise         = false;  // use blue noise RNG instead of white noise for dist sampler
+    enum llama_rng_type rng_type = LLAMA_RNG_TYPE_MT19937; // RNG type for dist sampler
 
     uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 2811eb3a48..f98bd7b311 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -167,11 +167,14 @@ std::string common_params_sampling::print() const {
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
             "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
-            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f",
+            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f\n"
+            "\tblue_noise = %s, rng_type = %s",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
             top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
-            mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay);
+            mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay,
+            blue_noise ? "true" : "false",
+            rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937");
 
     return std::string(result);
 }
@@ -313,11 +316,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
         } else {
             // default: sample from distribution
-            if (params.blue_noise) {
-                samplers.push_back(llama_sampler_init_dist_blue_noise(params.seed));
-            } else {
-                samplers.push_back(llama_sampler_init_dist(params.seed));
-            }
+            samplers.push_back(llama_sampler_init_dist_rng(params.seed, params.blue_noise, params.rng_type));
         }
     } else if (params.mirostat == 1) {
         samplers.push_back(llama_sampler_init_temp(params.temp));
diff --git a/include/llama.h b/include/llama.h
index 22f08e1683..d9f4acc5c7 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -188,6 +188,11 @@ extern "C" {
 
     LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
 
+    enum llama_rng_type {
+        LLAMA_RNG_TYPE_MT19937   = 0, // std::mt19937 (default for the dist sampler)
+        LLAMA_RNG_TYPE_LOWBIAS32 = 1, // position counter hashed with lowbias32
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -1295,8 +1300,8 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
 
     /// seed == LLAMA_DEFAULT_SEED to use a random seed.
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist           (uint32_t seed);
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist    (uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist_rng(uint32_t seed, bool blue_noise, enum llama_rng_type rng_type);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 34f8a62ab4..0a74f2d26f 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -633,8 +633,8 @@ struct llama_dist_rng_mt19937 : llama_dist_rng {
 struct llama_dist_rng_blue : llama_dist_rng {
     blue_noise_rng bn_rng;
 
-    llama_dist_rng_blue(uint32_t seed)
-        : bn_rng(16, std::make_unique<llama_dist_rng_lowbias32>(seed)) {}
+    llama_dist_rng_blue(std::unique_ptr<llama_dist_rng> source)
+        : bn_rng(16, std::move(source)) {}
 
     bool requires_sorted() override { return true; }
 
@@ -1591,32 +1591,34 @@ static struct llama_sampler_i llama_sampler_dist_i = {
     /* .backend_set_input = */ llama_sampler_dist_backend_set_input,
 };
 
-struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
+static std::unique_ptr<llama_dist_rng> make_dist_rng(uint32_t seed, enum llama_rng_type rng_type) {
+    // only lowbias32 needs its own branch: mt19937 and any unknown value share the fallback below
+    if (rng_type == LLAMA_RNG_TYPE_LOWBIAS32) {
+        return std::make_unique<llama_dist_rng_lowbias32>(seed);
+    }
+    return std::make_unique<llama_dist_rng_mt19937>(seed);
+}
+
+struct llama_sampler * llama_sampler_init_dist_rng(uint32_t seed, bool blue_noise, enum llama_rng_type rng_type) {
     auto seed_cur = get_rng_seed(seed);
+    auto rng = make_dist_rng(seed_cur, rng_type); // base generator, seeded with seed_cur
+    if (blue_noise) { // wrap the base generator; the blue noise rng takes ownership of it
+        rng = std::make_unique<llama_dist_rng_blue>(std::move(rng));
+    }
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx   = */ new llama_sampler_dist {
             {"dist"},
             /* .seed        = */ seed,
             /* .seed_cur    = */ seed_cur,
-            /* .rng         = */ std::make_unique<llama_dist_rng_mt19937>(seed_cur),
+            /* .rng         = */ std::move(rng),
             /* .inp_uniform = */ nullptr,
         }
     );
 }
 
-struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) {
-    auto seed_cur = get_rng_seed(seed);
-    return llama_sampler_init(
-        /* .iface = */ &llama_sampler_dist_i,
-        /* .ctx   = */ new llama_sampler_dist {
-            {"dist-blue-noise"},
-            /* .seed        = */ seed,
-            /* .seed_cur    = */ seed_cur,
-            /* .rng         = */ std::make_unique<llama_dist_rng_blue>(seed_cur),
-            /* .inp_uniform = */ nullptr,
-        }
-    );
+struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
+    return llama_sampler_init_dist_rng(seed, false, LLAMA_RNG_TYPE_MT19937); // no blue noise, mt19937: same generator as before
 }
 
 // top-k
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 16c3cf12d0..d717165daa 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -67,6 +67,7 @@ json task_params::to_json(bool only_metrics) const {
             {"n_discard",                 n_discard},
             {"ignore_eos",                sampling.ignore_eos},
             {"blue_noise",                sampling.blue_noise},
+            {"rng_type",                sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"},
             {"stream",                    stream},
             {"n_probs",                   sampling.n_probs},
             {"min_keep",                  sampling.min_keep},
@@ -127,6 +128,7 @@ json task_params::to_json(bool only_metrics) const {
         {"n_discard",                 n_discard},
         {"ignore_eos",                sampling.ignore_eos},
         {"blue_noise",                sampling.blue_noise},
+        {"rng_type",                sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"},
         {"stream",                    stream},
         {"logit_bias",                format_logit_bias(sampling.logit_bias)},
         {"n_probs",                   sampling.n_probs},
@@ -470,6 +472,14 @@ task_params server_task::params_from_json_cmpl(
         }
 
         params.sampling.blue_noise  = json_value(data, "blue_noise",  params_base.sampling.blue_noise);
+        {
+            // an absent or unrecognized "rng_type" falls back to the server-wide setting, like the options around it
+            const auto rng_source = json_value(data, "rng_type", std::string(""));
+            params.sampling.rng_type =
+                rng_source == "lowbias32" ? LLAMA_RNG_TYPE_LOWBIAS32 :
+                rng_source == "mt19937"   ? LLAMA_RNG_TYPE_MT19937   :
+                                            params_base.sampling.rng_type;
+        }
         params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
         if (params.sampling.ignore_eos) {
             params.sampling.logit_bias.insert(

From e896007ad1d4ce9bfd24d039fc106d895afc7fed Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 05:27:13 +0000
Subject: [PATCH 21/25] sampling : fix whitespace

---
 tools/server/server-task.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index d717165daa..fdeddbf21d 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -67,7 +67,7 @@ json task_params::to_json(bool only_metrics) const {
             {"n_discard",                 n_discard},
             {"ignore_eos",                sampling.ignore_eos},
             {"blue_noise",                sampling.blue_noise},
-            {"rng_type",                sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"},
+            {"rng_type",                  sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"},
             {"stream",                    stream},
             {"n_probs",                   sampling.n_probs},
             {"min_keep",                  sampling.min_keep},
@@ -128,7 +128,7 @@ json task_params::to_json(bool only_metrics) const {
         {"n_discard",                 n_discard},
         {"ignore_eos",                sampling.ignore_eos},
         {"blue_noise",                sampling.blue_noise},
-        {"rng_type",                sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"},
+        {"rng_type",                  sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"},
         {"stream",                    stream},
         {"logit_bias",                format_logit_bias(sampling.logit_bias)},
         {"n_probs",                   sampling.n_probs},

From a4858de4e49785af40a9e9fa68a41ac8bcd2d4d3 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 05:36:32 +0000
Subject: [PATCH 22/25] sampling : build fix and cleanup

---
 src/llama-sampler.cpp | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 0a74f2d26f..f06c76077b 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -342,12 +342,6 @@ struct llama_dist_rng {
     // this also indicates whether the RNG output itself must be consumed in a sequential order
     virtual bool                            requires_sorted()  = 0;
 
-    // for compatibility with std::discrete_distribution
-    // only used in a disabled branch of llama_sampler_dist_apply
-    virtual uint32_t                        rng_min()          = 0;
-    virtual uint32_t                        rng_max()          = 0;
-    virtual uint32_t                        next()             = 0; // uniform bits in [rng_min(), rng_max()]
-
     virtual uint32_t                        next32()           = 0; // uniform 32 bits
     virtual uint64_t                        next64()           = 0; // uniform 64 bits
     virtual double                          nextf()            = 0; // uniform double in [0, 1)
@@ -489,9 +483,9 @@ struct llama_dist_urbg {
 
     llama_dist_rng & rng;
 
-    result_type min() { return rng.rng_min(); }
-    result_type max() { return rng.rng_max(); }
-    result_type operator()() { return rng.next(); }
+    static constexpr result_type min() { return 0; }
+    static constexpr result_type max() { return UINT32_MAX; }
+    result_type operator()() { return rng.next32(); }
 };
 
 // wrapper to use existing llama_sample_dist for mt19937, otherwise implements CDF walk directly
@@ -543,8 +537,6 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng {
     llama_dist_rng_lowbias32(uint32_t seed) : hashed_seed(hash(seed)), position(0) {}
 
     bool requires_sorted() override { return false; }
-    uint32_t rng_min() override { return 0; }
-    uint32_t rng_max() override { return UINT32_MAX; }
 
     static uint32_t hash(uint32_t x) { // lowbias32
         // coefficients from https://github.com/skeeto/hash-prospector/issues/19
@@ -554,7 +546,7 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng {
         return x;
     }
 
-    uint32_t next() override {
+    uint32_t next() {
         uint32_t val = hash(position ^ hashed_seed);
         position++;
         return val;
@@ -597,10 +589,6 @@ struct llama_dist_rng_mt19937 : llama_dist_rng {
 
     bool requires_sorted() override { return false; }
 
-    uint32_t rng_min() override { return std::mt19937::min(); }
-    uint32_t rng_max() override { return std::mt19937::max(); }
-    uint32_t next() override { return rng(); }
-
     uint32_t next32() override {
         return rng();
     }
@@ -638,10 +626,6 @@ struct llama_dist_rng_blue : llama_dist_rng {
 
     bool requires_sorted() override { return true; }
 
-    uint32_t rng_min() override { return 0; }
-    uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; }
-    uint32_t next() override { return bn_rng.next(); }
-
     uint32_t next32() override {
         return bn_rng.next32();
     }

From 2a74c288c83a5d9d2e6fa7f956f096bb5ef1b4db Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 09:01:45 +0000
Subject: [PATCH 23/25] sampling : prefer high bits as source for generating
 blue noise

---
 src/llama-sampler.cpp | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index f06c76077b..717e1e73d5 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -419,12 +419,6 @@ struct blue_noise_rng {
             uint32_t h = rng->next32() % 10;
             states[i] = {tbl[h][0], tbl[h][1]}; // random initial state
         }
-
-#if 0
-        // test against initial implementation outputs
-        // note: white noise padding in next64 is slightly different, but minimally consequential for testing
-        rng->reset(); // reset position so generation starts from 0
-#endif
     }
 
     uint16_t advance(uint32_t h) {
@@ -441,7 +435,7 @@ struct blue_noise_rng {
             s[1] = 0;
 
             // error diffusion dithering using binary weight perturbation
-            s[(h >> level) & 1 ? 0 : 1] += qe; // forward to t+1 or defer to t+2
+            s[(h >> (31 - level)) & 1 ? 0 : 1] += qe; // forward to t+1 or defer to t+2
 
             acc = acc * 2 + out;
         }
@@ -458,14 +452,14 @@ struct blue_noise_rng {
     uint32_t next32() {
         uint32_t h   = rng->next32();
         uint32_t val = advance(h);
-        return (val << (32 - bit_depth)) | (h >> bit_depth);
+        return (val << (32 - bit_depth)) | (h & ((1u << (32 - bit_depth)) - 1));
     }
 
     // blue noise in the upper bits, white noise in the lower bits
     uint64_t next64() {
         uint64_t r   = rng->next64();
-        uint32_t val = advance((uint32_t)r);
-        return ((uint64_t)val << (64 - bit_depth)) | (r >> bit_depth);
+        uint32_t val = advance((uint32_t)(r >> 32));
+        return ((uint64_t)val << (64 - bit_depth)) | (r & ((UINT64_C(1) << (64 - bit_depth)) - 1));
     }
 
     // uniform double in [0, 1) with blue noise temporal autocorrelation
@@ -557,8 +551,8 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng {
     }
 
     uint64_t next64() override {
-        uint64_t hi = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed
-        uint64_t lo = next();
+        uint64_t lo = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed
+        uint64_t hi = next();
         return (hi << 32) | lo;
     }
 

From 10179a636d41f63870148c941a78c753944dfc9f Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 12:55:54 +0000
Subject: [PATCH 24/25] sampling : also use upper bits for initializing state

---
 src/llama-sampler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp
index 717e1e73d5..633be9bceb 100644
--- a/src/llama-sampler.cpp
+++ b/src/llama-sampler.cpp
@@ -416,7 +416,7 @@ struct blue_noise_rng {
             {-1, -1},
         };
         for (int i = 0; i < n; i++) {
-            uint32_t h = rng->next32() % 10;
+            uint32_t h = (uint32_t)(((uint64_t)rng->next32() * 10) >> 32);
             states[i] = {tbl[h][0], tbl[h][1]}; // random initial state
         }
     }

From 1f42650078e2f5f36d2ec03789ac392da897e9e2 Mon Sep 17 00:00:00 2001
From: Jan Boon <jan.boon@kaetemi.be>
Date: Mon, 9 Feb 2026 15:13:56 +0000
Subject: [PATCH 25/25] sampling : add rng test case

---
 tests/test-sampling.cpp | 74 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 7cd96c5cd3..1a04ac5b11 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -192,6 +192,73 @@ static void test_top_n_sigma(const std::vector<float> & probs, const std::vector
     tester.check();
 }
 
+static void test_dist_rng(uint32_t seed, bool blue_noise, enum llama_rng_type rng_type,
+                          const std::vector<llama_token> & expected, const char * desc) {
+    // draws n_samples tokens from a fixed distribution and compares them with `expected`;
+    // an empty `expected` only prints the drawn sequence so that it can be captured
+    // `desc` labels the printed lines
+    const int n_vocab   = 16;
+    const int n_samples = 32;
+
+    // fixed non-uniform distribution: token i has logit log(i+1)
+    std::vector<llama_token_data> data(n_vocab);
+    for (int i = 0; i < n_vocab; i++) {
+        data[i] = {i, logf((float)(i + 1)), 0.0f};
+    }
+
+    auto * sampler = llama_sampler_init_dist_rng(seed, blue_noise, rng_type);
+    std::vector<llama_token> tokens(n_samples);
+
+    for (int i = 0; i < n_samples; i++) {
+        // every draw works on a fresh copy of the candidates
+        std::vector<llama_token_data> cur(data);
+        llama_token_data_array cur_p = {cur.data(), cur.size(), -1, false};
+        llama_sampler_apply(sampler, &cur_p);
+        GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (llama_token)n_vocab);
+        tokens[i] = cur_p.data[cur_p.selected].id;
+    }
+
+    if (expected.empty()) {
+        // print sequence for capture
+        printf("test_dist_rng %s: {", desc);
+        for (int i = 0; i < n_samples; i++) {
+            printf("%s%d", i ? ", " : "", tokens[i]);
+        }
+        printf("}\n");
+    } else {
+        // verify against known sequence
+        GGML_ASSERT((int)expected.size() == n_samples);
+        if (tokens != expected) { // sizes are equal here, so this compares element by element
+            // print both sequences to make the first differing position easy to find
+            printf("test_dist_rng %s: MISMATCH\n  got:      {", desc);
+            for (int i = 0; i < n_samples; i++) {
+                printf("%s%d", i ? ", " : "", tokens[i]);
+            }
+            printf("}\n  expected: {");
+            for (int i = 0; i < n_samples; i++) {
+                printf("%s%d", i ? ", " : "", expected[i]);
+            }
+            printf("}\n");
+            GGML_ASSERT(false);
+        }
+
+        // also verify reset reproduces same sequence
+        llama_sampler_reset(sampler);
+        for (int i = 0; i < n_samples; i++) {
+            std::vector<llama_token_data> cur(data);
+            llama_token_data_array cur_p = {cur.data(), cur.size(), -1, false};
+            llama_sampler_apply(sampler, &cur_p);
+            // the index starts at -1, so validate it before it is used to read the candidate
+            GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (llama_token)n_vocab);
+            GGML_ASSERT(cur_p.data[cur_p.selected].id == tokens[i]);
+        }
+
+        printf("test_dist_rng %-30s OK\n", desc);
+    }
+
+    llama_sampler_free(sampler);
+}
+
 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
     sampler_tester tester(n_vocab);
@@ -392,6 +459,13 @@ int main(void) {
     test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
     test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
 
+    test_dist_rng(42, false, LLAMA_RNG_TYPE_LOWBIAS32,
+        {5, 12, 8, 10, 12, 11, 10, 8, 8, 10, 11, 9, 7, 6, 11, 13, 14, 15, 13, 4, 12, 14, 13, 13, 14, 12, 5, 15, 4, 13, 15, 12},
+        "lowbias32");
+    test_dist_rng(42, true,  LLAMA_RNG_TYPE_LOWBIAS32,
+        {10, 5, 12, 8, 15, 13, 3, 10, 13, 12, 2, 15, 8, 14, 5, 11, 7, 9, 15, 11, 8, 2, 12, 14, 7, 9, 13, 10, 14, 5, 12, 15},
+        "lowbias32 + blue noise");
+
     printf("OK\n");
 
     test_perf();