From b47e0cf21aba51c96f69e05ae97a9bfdc53faa5f Mon Sep 17 00:00:00 2001 From: Kai Gerd Schwebke Date: Sun, 1 Mar 2026 13:25:09 +0100 Subject: [PATCH] server: add Qwen3-Reranker instruction support --- convert_hf_to_gguf.py | 20 ++++++++++++++------ tools/server/server-common.cpp | 11 ++++++++++- tools/server/server-common.h | 1 + tools/server/server-context.cpp | 11 ++++++++++- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0954417398..9616e6427d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4425,12 +4425,20 @@ class Qwen3Model(Qwen2Model): if self.is_rerank: self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK) self.gguf_writer.add_classifier_output_labels(["yes", "no"]) - self.gguf_writer.add_chat_template([{ - "name": "rerank", - "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" - "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" - "<|im_start|>assistant\n<think>\n\n</think>\n\n" - }]) + self.gguf_writer.add_chat_template([ + { + "name": "rerank", + "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" + "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" + "<|im_start|>assistant\n<think>\n\n</think>\n\n", + }, + { + "name": "rerank_instruct", + "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. 
Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" + "<|im_start|>user\n<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" + "<|im_start|>assistant\n<think>\n\n</think>\n\n", + }, + ]) def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: # extract "yes" and "no" tokens from the output lm_head tensor diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index ff3c6d3c2b..cbc8c96bbe 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1999,13 +1999,22 @@ server_tokens format_prompt_rerank( const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, + const std::string & instruction, const std::string & query, const std::string & doc) { server_tokens result = {}; const char * rerank_prompt = llama_model_chat_template(model, "rerank"); + const char * rerank_prompt_instruct = llama_model_chat_template(model, "rerank_instruct"); - if (rerank_prompt != nullptr) { + if ( (rerank_prompt_instruct != nullptr) && !instruction.empty() ) { + std::string prompt = rerank_prompt_instruct; + string_replace_all(prompt, "{instruction}", instruction); + string_replace_all(prompt, "{query}" , query); + string_replace_all(prompt, "{document}" , doc ); + server_tokens tokens = tokenize_input_subprompt(vocab, mctx, prompt, false, true); + result.push_back(tokens); + } else if (rerank_prompt != nullptr) { std::string prompt = rerank_prompt; string_replace_all(prompt, "{query}" , query); string_replace_all(prompt, "{document}", doc ); diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 4fb9e488df..2e455a0eef 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -367,5 +367,6 @@ server_tokens format_prompt_rerank( const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, + const std::string & instruction, const std::string & query, const std::string & doc); diff --git a/tools/server/server-context.cpp 
b/tools/server/server-context.cpp index aafed49502..ea00004cba 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3803,6 +3803,15 @@ void server_routes::init_routes() { res->error(format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); return res; } + + json instruction = ""; + if (body.count("instruction") == 1) { + instruction = body.at("instruction"); + if (!instruction.is_string()) { + res->error(format_error_response("\"instruction\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } std::vector<std::string> documents = json_value(body, "documents", json_value(body, "texts", std::vector<std::string>())); @@ -3820,7 +3829,7 @@ void server_routes::init_routes() { std::vector<server_task> tasks; tasks.reserve(documents.size()); for (size_t i = 0; i < documents.size(); i++) { - auto tmp = format_prompt_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); + auto tmp = format_prompt_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, instruction, query, documents[i]); server_task task = server_task(SERVER_TASK_TYPE_RERANK); task.id = rd.get_new_id(); task.tokens = std::move(tmp);