From b47e0cf21aba51c96f69e05ae97a9bfdc53faa5f Mon Sep 17 00:00:00 2001 From: Kai Gerd Schwebke Date: Sun, 1 Mar 2026 13:25:09 +0100 Subject: [PATCH] server: add Qwen3-Reranker instruction support --- convert_hf_to_gguf.py | 20 ++++++++++++++------ tools/server/server-common.cpp | 11 ++++++++++- tools/server/server-common.h | 1 + tools/server/server-context.cpp | 11 ++++++++++- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0954417398..9616e6427d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4425,12 +4425,20 @@ class Qwen3Model(Qwen2Model): if self.is_rerank: self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK) self.gguf_writer.add_classifier_output_labels(["yes", "no"]) - self.gguf_writer.add_chat_template([{ - "name": "rerank", - "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" - "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" - "<|im_start|>assistant\n<think>\n\n</think>\n\n" - }]) + self.gguf_writer.add_chat_template([ + { + "name": "rerank", + "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" + "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" + "<|im_start|>assistant\n<think>\n\n</think>\n\n", + }, + { + "name": "rerank_instruct", + "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. 
Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" + "<|im_start|>user\n<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" + "<|im_start|>assistant\n<think>\n\n</think>\n\n", + }, + ]) def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: # extract "yes" and "no" tokens from the output lm_head tensor diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index ff3c6d3c2b..cbc8c96bbe 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1999,13 +1999,22 @@ server_tokens format_prompt_rerank( const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, + const std::string & instruction, const std::string & query, const std::string & doc) { server_tokens result = {}; const char * rerank_prompt = llama_model_chat_template(model, "rerank"); + const char * rerank_prompt_instruct = llama_model_chat_template(model, "rerank_instruct"); - if (rerank_prompt != nullptr) { + if ( (rerank_prompt_instruct != nullptr) && !instruction.empty() ) { + std::string prompt = rerank_prompt_instruct; + string_replace_all(prompt, "{instruction}", instruction); + string_replace_all(prompt, "{query}" , query); + string_replace_all(prompt, "{document}" , doc ); + server_tokens tokens = tokenize_input_subprompt(vocab, mctx, prompt, false, true); + result.push_back(tokens); + } else if (rerank_prompt != nullptr) { std::string prompt = rerank_prompt; string_replace_all(prompt, "{query}" , query); string_replace_all(prompt, "{document}", doc ); diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 4fb9e488df..2e455a0eef 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -367,5 +367,6 @@ server_tokens format_prompt_rerank( const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, + const std::string & instruction, const std::string & query, const std::string & doc); diff --git a/tools/server/server-context.cpp 
b/tools/server/server-context.cpp index aafed49502..ea00004cba 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3803,6 +3803,15 @@ void server_routes::init_routes() { res->error(format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); return res; } + + json instruction = ""; + if (body.count("instruction") == 1) { + instruction = body.at("instruction"); + if (!instruction.is_string()) { + res->error(format_error_response("\"instruction\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } std::vector<std::string> documents = json_value(body, "documents", json_value(body, "texts", std::vector<std::string>())); @@ -3820,7 +3829,7 @@ void server_routes::init_routes() { std::vector<server_task> tasks; tasks.reserve(documents.size()); for (size_t i = 0; i < documents.size(); i++) { - auto tmp = format_prompt_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); + auto tmp = format_prompt_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, instruction, query, documents[i]); server_task task = server_task(SERVER_TASK_TYPE_RERANK); task.id = rd.get_new_id(); task.tokens = std::move(tmp);