From 02760b6ab654358f5815951b5b1143e78c126e3b Mon Sep 17 00:00:00 2001
From: oliveagle
Date: Wed, 18 Feb 2026 00:12:48 +0800
Subject: [PATCH 1/2] Fix: Support V-L Embedding models in server mode

- Skip speculative decoding for embedding models (no output logits)
- Add tensor bounds validation for V-L model embedding extraction
- Fixes crashes with Qwen3-VL-Embedding models when using --embedding flag

Changes:
- common/speculative.cpp: Skip speculative_is_compat for embedding models
- src/llama-context.cpp: Handle variable tensor sizes in V-L architectures
---
 common/speculative.cpp |  6 ++++++
 src/llama-context.cpp  | 26 ++++++++++++++++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 3e68c38e49..6cb7c19623 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -804,6 +804,12 @@ bool common_speculative_is_compat(llama_context * ctx_tgt) {
         return false;
     }
 
+    // Skip speculative decoding for embedding models
+    // Embedding models don't have output logits needed for speculative decoding
+    if (llama_pooling_type(ctx_tgt) != LLAMA_POOLING_TYPE_NONE) {
+        return false;
+    }
+
     bool res = true;
 
     llama_memory_clear(mem, true);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 7cd0bfc0d2..2d8da83fb8 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1663,12 +1663,34 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     // extract sequence embeddings (cleared before processing each batch)
                     auto & embd_seq_out = embd_seq;
 
+                    // For V-L models, the embedding output tensor may have different dimensions
+                    // Use tensor's actual size to determine correct embedding dimension
+                    const size_t tensor_size = ggml_nbytes(t_embd);
+                    const uint32_t n_embd_tensor = tensor_size / (ubatch.n_seqs_unq > 0 ? ubatch.n_seqs_unq : 1) / sizeof(float);
+                    const uint32_t n_embd_to_use = (n_embd_tensor > 0 && n_embd_tensor < n_embd) ? n_embd_tensor : n_embd;
+
                     for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
                         const llama_seq_id seq_id = ubatch.seq_id_unq[s];
                         const int32_t seq_idx = ubatch.seq_idx[seq_id];
 
-                        embd_seq_out[seq_id].resize(n_embd);
-                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+                        embd_seq_out[seq_id].resize(n_embd_to_use);
+                        const size_t src_offset = (size_t)n_embd_to_use * seq_idx * sizeof(float);
+                        const size_t copy_size = (size_t)n_embd_to_use * sizeof(float);
+                        // Validate bounds
+                        if (src_offset + copy_size <= tensor_size) {
+                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), src_offset, copy_size);
+                        } else {
+                            LLAMA_LOG_ERROR("%s: tensor bounds check failed: offset=%zu + size=%zu > tensor_size=%zu, using fallback\n",
+                                __func__, src_offset, copy_size, tensor_size);
+                            // Try using smaller dimension
+                            const uint32_t n_embd_fallback = hparams.n_embd_out();
+                            if (n_embd_fallback > 0 && (size_t)n_embd_fallback * sizeof(float) <= tensor_size) {
+                                embd_seq_out[seq_id].resize(n_embd_fallback);
+                                ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), 0, n_embd_fallback * sizeof(float));
+                            } else {
+                                std::fill(embd_seq_out[seq_id].begin(), embd_seq_out[seq_id].end(), 0.0f);
+                            }
+                        }
                     }
                 } break;
             case LLAMA_POOLING_TYPE_RANK:

From 28be3617898e27fa1d229aa57dad4c13059f6d04 Mon Sep 17 00:00:00 2001
From: oliveagle
Date: Thu, 19 Feb 2026 23:42:35 +0800
Subject: [PATCH 2/2] server : simplify V-L embedding dimension handling

- Replace fragile fallback logic with direct tensor dimension access
- Use t_embd->ne[0] to get actual embedding dimension from tensor shape
- Remove complex bounds checking and fallback paths
- Addresses review feedback from ngxson

Co-Authored-By: Claude Opus 4.6
---
 src/llama-context.cpp | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 2d8da83fb8..c39cfcef52 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1664,33 +1664,19 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     auto & embd_seq_out = embd_seq;
 
                     // For V-L models, the embedding output tensor may have different dimensions
-                    // Use tensor's actual size to determine correct embedding dimension
-                    const size_t tensor_size = ggml_nbytes(t_embd);
-                    const uint32_t n_embd_tensor = tensor_size / (ubatch.n_seqs_unq > 0 ? ubatch.n_seqs_unq : 1) / sizeof(float);
-                    const uint32_t n_embd_to_use = (n_embd_tensor > 0 && n_embd_tensor < n_embd) ? n_embd_tensor : n_embd;
+                    // The embedding dimension is determined by the tensor shape (ne[0]), not by model hparams
+                    const uint32_t n_embd_tensor = t_embd->ne[0];
+
+                    // Use the tensor's embedding dimension if valid, otherwise fall back to model dimension
+                    const uint32_t n_embd_to_use = n_embd_tensor > 0 ? n_embd_tensor : n_embd;
 
                     for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
                         const llama_seq_id seq_id = ubatch.seq_id_unq[s];
                         const int32_t seq_idx = ubatch.seq_idx[seq_id];
 
                         embd_seq_out[seq_id].resize(n_embd_to_use);
-                        const size_t src_offset = (size_t)n_embd_to_use * seq_idx * sizeof(float);
-                        const size_t copy_size = (size_t)n_embd_to_use * sizeof(float);
-                        // Validate bounds
-                        if (src_offset + copy_size <= tensor_size) {
-                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), src_offset, copy_size);
-                        } else {
-                            LLAMA_LOG_ERROR("%s: tensor bounds check failed: offset=%zu + size=%zu > tensor_size=%zu, using fallback\n",
-                                __func__, src_offset, copy_size, tensor_size);
-                            // Try using smaller dimension
-                            const uint32_t n_embd_fallback = hparams.n_embd_out();
-                            if (n_embd_fallback > 0 && (size_t)n_embd_fallback * sizeof(float) <= tensor_size) {
-                                embd_seq_out[seq_id].resize(n_embd_fallback);
-                                ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), 0, n_embd_fallback * sizeof(float));
-                            } else {
-                                std::fill(embd_seq_out[seq_id].begin(), embd_seq_out[seq_id].end(), 0.0f);
-                            }
-                        }
+                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(),
+                            (n_embd_to_use*seq_idx)*sizeof(float), n_embd_to_use*sizeof(float));
                     }
                 } break;
             case LLAMA_POOLING_TYPE_RANK: