From f17b3be63fd9dfa3d12ad3555c977a6430661769 Mon Sep 17 00:00:00 2001 From: ZeroV0LT Date: Fri, 13 Mar 2026 19:53:42 +0100 Subject: [PATCH] llama : fix pooling assertion crash in chunked GDN detection path (#20468) * llama : fix pooling assertion crash in chunked GDN detection path The chunked fused Gated Delta Net detection in sched_reserve() calls graph_reserve(16*n_seqs, n_seqs, n_outputs, ...) where n_outputs = n_seqs. This creates a dimension mismatch in build_pooling() for embedding models with mean/rank pooling: build_inp_mean() creates a tensor with shape [n_tokens=16*n_seqs, ...] while t_embd is reduced to [n_outputs=n_seqs, ...] via out_ids, causing ggml_mul_mat to assert on ggml_can_mul_mat(a, b). Fix: pass n_tokens as n_outputs in the chunked GDN graph reservation, matching the pattern used by the pp/tg worst-case reservations. Regression introduced by #20340 (d28961d). Same class of bug as #12517, fixed by #12545. * server : add mean pooling tests to embedding test suite Add test_embedding_pooling_mean and test_embedding_pooling_mean_multiple to cover the --pooling mean codepath, which was previously untested. These tests would have caught the regression introduced by #20340 where build_pooling() crashes with a ggml_mul_mat assertion due to mismatched dimensions in the chunked GDN detection path. --------- Co-authored-by: Domenico Crupi --- src/llama-context.cpp | 7 ++++- tools/server/tests/unit/test_embedding.py | 34 +++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 14dccac5b5..1f7a52d789 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -512,7 +512,12 @@ void llama_context::sched_reserve() { if (cparams.fused_gdn_ch) { // more than one token in the batch per sequence in order to take the chunked path - auto * gf = graph_reserve(16*n_seqs, n_seqs, n_outputs, mctx.get(), true); + // note: n_outputs must match n_tokens for embedding models with mean/rank pooling, + // because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies + // it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens, + // the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553). + const uint32_t n_tokens_ch = 16*n_seqs; + auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true); if (!gf) { throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)"); } diff --git a/tools/server/tests/unit/test_embedding.py b/tools/server/tests/unit/test_embedding.py index 50601b8396..17ba09554b 100644 --- a/tools/server/tests/unit/test_embedding.py +++ b/tools/server/tests/unit/test_embedding.py @@ -101,6 +101,40 @@ def test_embedding_mixed_input(input, is_multi_prompt: bool): assert len(data[0]['embedding']) > 1 +def test_embedding_pooling_mean(): + global server + server.pooling = 'mean' + server.start() + res = server.make_request("POST", "/v1/embeddings", data={ + "input": "I believe the meaning of life is", + }) + assert res.status_code == 200 + assert len(res.body['data']) == 1 + assert 'embedding' in res.body['data'][0] + assert len(res.body['data'][0]['embedding']) > 1 + + # make sure embedding vector is normalized + assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON + + +def test_embedding_pooling_mean_multiple(): + global server + server.pooling = 'mean' + server.start() + res = server.make_request("POST", "/v1/embeddings", data={ + "input": [ + "I believe the meaning of life is", + "Write a joke about AI", + "This is a test", + ], + }) + assert res.status_code == 200 + assert len(res.body['data']) == 3 + for d in res.body['data']: + assert 'embedding' in d + assert len(d['embedding']) > 1 + + def test_embedding_pooling_none(): global server server.pooling = 'none'