Merge branch 'ggml-org:master' into master

2026-03-13 14:17:27 -05:00 · 2026-03-13 14:17:27 -05:00 · 302089bd2d
parent fe33dbe2fb f17b3be63f
commit 302089bd2d
5 changed files with 1464 additions and 546 deletions
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@ -199,13 +199,6 @@
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__riscv)
 // quants.c
-#define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
-#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
-#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
-#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
-#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -512,7 +512,12 @@ void llama_context::sched_reserve() {

        if (cparams.fused_gdn_ch) {
            // more than one token in the batch per sequence in order to take the chunked path
-            auto * gf = graph_reserve(16*n_seqs, n_seqs, n_outputs, mctx.get(), true);
+            // note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
+            // because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
+            // it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
+            // the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
+            const uint32_t n_tokens_ch = 16*n_seqs;
+            auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
            if (!gf) {
                throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)");
            }
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -1189,6 +1189,9 @@ private:
            ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
            : SLOT_STATE_STARTED;

+        // reset server kill-switch counter
+        n_empty_consecutive = 0;
+
        SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
        return true;
    }
--- a/tools/server/tests/unit/test_embedding.py
+++ b/tools/server/tests/unit/test_embedding.py
@ -101,6 +101,40 @@ def test_embedding_mixed_input(input, is_multi_prompt: bool):
        assert len(data[0]['embedding']) > 1


+def test_embedding_pooling_mean():
+    global server
+    server.pooling = 'mean'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": "I believe the meaning of life is",
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 1
+    assert 'embedding' in res.body['data'][0]
+    assert len(res.body['data'][0]['embedding']) > 1
+
+    # make sure embedding vector is normalized
+    assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
+
+
+def test_embedding_pooling_mean_multiple():
+    global server
+    server.pooling = 'mean'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "Write a joke about AI",
+            "This is a test",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 3
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
 def test_embedding_pooling_none():
    global server
    server.pooling = 'none'