Fix: Support V-L Embedding models in server mode

- Skip speculative decoding for embedding models (no output logits)
- Add tensor bounds validation for V-L model embedding extraction
- Fixes crashes with Qwen3-VL-Embedding models when using the --embedding flag (a minimal API-level sketch of the affected embedding read path appears just before the diffs below)

Changes:
- common/speculative.cpp: make common_speculative_is_compat return false for embedding models
- src/llama-context.cpp: Handle variable tensor sizes in V-L architectures
Author: oliveagle
Date:   2026-02-18 00:12:48 +08:00
Parent: 237958db33
Commit: 02760b6ab6

2 changed files with 30 additions and 2 deletions
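The embedding read path this commit protects can be illustrated with the public llama.cpp C API. The following is a minimal sketch only, assuming a context created with embeddings enabled and a pooling type other than NONE; the helper name is illustrative, and the use of the model's n_embd as the output dimension is exactly the assumption that the llama-context change below relaxes for V-L models.

    // Sketch only: read one sequence's pooled embedding through the public API.
    // llama_get_model, llama_model_n_embd and llama_get_embeddings_seq are existing
    // llama.h calls; read_pooled_embedding itself is a hypothetical helper.
    #include <vector>
    #include "llama.h"

    static std::vector<float> read_pooled_embedding(llama_context * ctx, llama_seq_id seq_id) {
        const llama_model * model  = llama_get_model(ctx);
        const int32_t       n_embd = llama_model_n_embd(model);            // assumes output dim == n_embd
        const float *       emb    = llama_get_embeddings_seq(ctx, seq_id); // pooled output for seq_id
        if (emb == nullptr) {
            return {};
        }
        return std::vector<float>(emb, emb + n_embd);
    }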

common/speculative.cpp

@@ -804,6 +804,12 @@ bool common_speculative_is_compat(llama_context * ctx_tgt) {
         return false;
     }
 
+    // Skip speculative decoding for embedding models
+    // Embedding models don't have output logits needed for speculative decoding
+    if (llama_pooling_type(ctx_tgt) != LLAMA_POOLING_TYPE_NONE) {
+        return false;
+    }
+
     bool res = true;
 
     llama_memory_clear(mem, true);
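Usage note (not part of the commit): a caller in the server can consult this predicate before wiring up a draft model. The sketch below is an assumption-laden illustration; the include path and the maybe_enable_speculative helper are placeholders, and only common_speculative_is_compat and the pooling-type behaviour come from this change.

    // Sketch: gate speculative-decoding setup on the compatibility check above.
    #include <cstdio>
    #include "llama.h"
    #include "speculative.h"   // assumed location of common_speculative_is_compat

    static bool maybe_enable_speculative(llama_context * ctx_tgt) {
        if (!common_speculative_is_compat(ctx_tgt)) {
            // embedding contexts (pooling type != LLAMA_POOLING_TYPE_NONE) now land here
            // instead of crashing later when the draft path expects output logits
            fprintf(stderr, "speculative decoding disabled: target context is not compatible\n");
            return false;
        }
        return true;
    }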

src/llama-context.cpp

@@ -1663,12 +1663,34 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     // extract sequence embeddings (cleared before processing each batch)
                     auto & embd_seq_out = embd_seq;
 
+                    // For V-L models, the embedding output tensor may have different dimensions
+                    // Use tensor's actual size to determine correct embedding dimension
+                    const size_t   tensor_size   = ggml_nbytes(t_embd);
+                    const uint32_t n_embd_tensor = tensor_size / (ubatch.n_seqs_unq > 0 ? ubatch.n_seqs_unq : 1) / sizeof(float);
+                    const uint32_t n_embd_to_use = (n_embd_tensor > 0 && n_embd_tensor < n_embd) ? n_embd_tensor : n_embd;
+
                     for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
                         const llama_seq_id seq_id  = ubatch.seq_id_unq[s];
                         const int32_t      seq_idx = ubatch.seq_idx[seq_id];
 
-                        embd_seq_out[seq_id].resize(n_embd);
-                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+                        embd_seq_out[seq_id].resize(n_embd_to_use);
+                        const size_t src_offset = (size_t) n_embd_to_use * seq_idx * sizeof(float);
+                        const size_t copy_size  = (size_t) n_embd_to_use * sizeof(float);
+                        // Validate bounds
+                        if (src_offset + copy_size <= tensor_size) {
+                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), src_offset, copy_size);
+                        } else {
+                            LLAMA_LOG_ERROR("%s: tensor bounds check failed: offset=%zu + size=%zu > tensor_size=%zu, using fallback\n",
+                                    __func__, src_offset, copy_size, tensor_size);
+                            // Try using smaller dimension
+                            const uint32_t n_embd_fallback = hparams.n_embd_out();
+                            if (n_embd_fallback > 0 && (size_t) n_embd_fallback * sizeof(float) <= tensor_size) {
+                                embd_seq_out[seq_id].resize(n_embd_fallback);
+                                ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), 0, n_embd_fallback * sizeof(float));
+                            } else {
+                                std::fill(embd_seq_out[seq_id].begin(), embd_seq_out[seq_id].end(), 0.0f);
+                            }
+                        }
                     }
                 } break;
             case LLAMA_POOLING_TYPE_RANK:
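For completeness, the dimension inference and bounds check in the hunk above can be exercised in isolation. The standalone program below is an illustration only, with made-up sizes (two sequences, a 1536-dim pooled output from a model whose hidden size n_embd is 2048); the variable names mirror the hunk but none of them are llama.cpp symbols here.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const size_t   f32         = sizeof(float);
        const uint32_t n_seqs_unq  = 2;                // sequences in the ubatch
        const uint32_t n_embd      = 2048;             // model hidden size
        const size_t   tensor_size = 2 * 1536 * f32;   // pooled output: 2 seqs x 1536 floats

        // infer the per-sequence dimension actually present in the tensor
        const uint32_t n_embd_tensor = tensor_size / (n_seqs_unq > 0 ? n_seqs_unq : 1) / f32;
        const uint32_t n_embd_to_use = (n_embd_tensor > 0 && n_embd_tensor < n_embd) ? n_embd_tensor : n_embd;

        for (uint32_t s = 0; s < n_seqs_unq; ++s) {
            const size_t src_offset = (size_t) n_embd_to_use * s * f32;
            const size_t copy_size  = (size_t) n_embd_to_use * f32;
            const bool   ok         = src_offset + copy_size <= tensor_size;
            // prints offsets 0 and 6144 with size 6144, both in bounds for a 12288-byte tensor
            printf("seq %u: offset=%zu size=%zu in_bounds=%s\n", (unsigned) s, src_offset, copy_size, ok ? "yes" : "no");
        }
        return 0;
    }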