From 4d99d45084f33574e5924ad8457344bfa36788b9 Mon Sep 17 00:00:00 2001 From: Vinicios Lugli Date: Tue, 10 Mar 2026 19:40:14 -0300 Subject: [PATCH] model : qwen3vl reranker text support (#20332) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * model : fix qwen3vl reranker support * Remove CLS_OUT Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 24 ++++++++++++++++++++---- src/llama-arch.cpp | 1 + src/llama-graph.cpp | 4 ++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 083b5bca9e..30347f7389 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4390,15 +4390,31 @@ class Qwen3Model(Qwen2Model): hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) self.origin_hf_arch = hparams.get('architectures', [None])[0] - # a bit hacky, but currently the only way to detect if this is a rerank model - # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B + if self._is_qwen3_reranker(): + self._find_rerank_config() + + def _is_qwen3_reranker(self) -> bool: readme_path = self.dir_model / "README.md" readme_text = "" if readme_path.exists(): with readme_path.open("r", encoding="utf-8") as f: readme_text = f.read() - if "# Qwen3-Reranker" in readme_text: - self._find_rerank_config() + + name_hints = [ + str(self.dir_model.name), + str(self.hparams.get("_name_or_path", "")), + str(self.hparams.get("model_type", "")), + str(self.origin_hf_arch or ""), + ] + name_hints = [hint.lower() for hint in name_hints if hint] + + if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower(): + return True + + if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints): + return True + + return "sequenceclassification" in (self.origin_hf_arch or "").lower() def set_vocab(self): # deal with intern-s1-mini diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ce49bbd988..204105b6dd 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1087,6 +1087,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_CLS_OUT, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_Q_NORM, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 5f875136a1..528f8e5458 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -250,7 +250,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { const bool last = ( cparams.pooling_type == LLAMA_POOLING_TYPE_LAST || - (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token + (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL)) // qwen3 reranking & embedding models use last token ); for (int i = 0; i < n_tokens; ++i) { @@ -2552,7 +2552,7 @@ void llm_graph_context::build_pooling( } // softmax for qwen3 reranker - if (arch == LLM_ARCH_QWEN3) { + if (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL) { cur = ggml_soft_max(ctx0, cur); } } break;