server: improve Responses API compliance and Codex CLI compatibility

Codex CLI compatibility:
- Skip non-function tool types (web_search, code_interpreter)
- Merge developer/system messages into position 0 for Qwen templates
- Strip Responses-only request keys (store, include, prompt_cache_key)
- Add output_text convenience field in streaming and non-streaming responses

Responses API compliance (ideas from #19720 by riskywindow, adapted):
- Add 24 missing Response object fields per OpenAI spec
- Fix function_call id/call_id field mapping
- Add sequence_number, output_index, content_index to streaming events
- Accept input_text type and EasyInputMessage for multi-turn input

Verified: codex -p local and codex -p fast both work against a local
llama.cpp server with Qwen3.5 models, including native tool calling.

Refs: ggml-org/llama.cpp#19138, ggml-org/llama.cpp#19720
This commit is contained in:
Christopher Albert 2026-03-30 09:39:59 +02:00
parent 7c203670f8
commit 302c3c8f61
3 changed files with 192 additions and 71 deletions

View File

@ -1255,6 +1255,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
if (item.contains("status")) { if (item.contains("status")) {
item.erase("status"); item.erase("status");
} }
// Merge system/developer messages into the first system message.
// Many model templates (e.g. Qwen) require all system content at
// position 0 and reject system messages elsewhere in the conversation.
if (item.at("role") == "system" || item.at("role") == "developer") {
if (!chatcmpl_messages.empty() && chatcmpl_messages[0].value("role", "") == "system") {
auto & first_msg = chatcmpl_messages[0];
// Convert string content to array format if needed
if (first_msg["content"].is_string()) {
std::string old_text = first_msg["content"].get<std::string>();
first_msg["content"] = json::array({json{{"text", old_text}, {"type", "text"}}});
}
auto & first_content = first_msg["content"];
for (const auto & part : chatcmpl_content) {
first_content.push_back(part);
}
continue; // merged, don't push a separate message
}
item["role"] = "system";
}
item["content"] = chatcmpl_content; item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item); chatcmpl_messages.push_back(item);
@ -1266,35 +1285,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
// item.at("status") == "completed" || // item.at("status") == "completed" ||
// item.at("status") == "incomplete") && // item.at("status") == "incomplete") &&
// item["status"] not sent by codex-cli // item["status"] not sent by codex-cli
exists_and_is_string(item, "type") && // item["type"] == "message" for OutputMessage, absent for EasyInputMessage
item.at("type") == "message" (!item.contains("type") || item.at("type") == "message")
) { ) {
// #responses_create-input-input_item_list-item-output_message // #responses_create-input-input_item_list-item-output_message
auto chatcmpl_content = json::array(); // Also handles AssistantMessageItemParam / EasyInputMessage with role "assistant"
std::vector<json> chatcmpl_content;
for (const auto & output_text : item.at("content")) { for (const auto & output_text : item.at("content")) {
const std::string type = json_value(output_text, "type", std::string()); const std::string type = json_value(output_text, "type", std::string());
if (type == "output_text") { if (type != "output_text" && type != "input_text") {
if (!exists_and_is_string(output_text, "text")) { throw std::invalid_argument("'type' must be 'output_text' or 'input_text'");
throw std::invalid_argument("'Output text' requires 'text'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
}
} else if (type == "refusal") {
if (!exists_and_is_string(output_text, "refusal")) {
throw std::invalid_argument("'Refusal' requires 'refusal'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"refusal", output_text.at("refusal")},
{"type", "refusal"},
});
}
} else {
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
} }
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
} }
if (merge_prev) { if (merge_prev) {
@ -1303,7 +1312,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
prev_msg["content"] = json::array(); prev_msg["content"] = json::array();
} }
auto & prev_content = prev_msg["content"]; auto & prev_content = prev_msg["content"];
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end()); for (const auto & part : chatcmpl_content) {
prev_content.push_back(part);
}
} else { } else {
item.erase("status"); item.erase("status");
item.erase("type"); item.erase("type");
@ -1407,11 +1418,17 @@ json convert_responses_to_chatcmpl(const json & response_body) {
} }
std::vector<json> chatcmpl_tools; std::vector<json> chatcmpl_tools;
for (json resp_tool : response_body.at("tools")) { for (json resp_tool : response_body.at("tools")) {
json chatcmpl_tool; const std::string tool_type = json_value(resp_tool, "type", std::string());
if (json_value(resp_tool, "type", std::string()) != "function") { // Skip non-function tools (e.g. web_search, code_interpreter)
throw std::invalid_argument("'type' of tool must be 'function'"); // sent by clients like Codex CLI — these are provider-specific
// and cannot be converted to chat completions function tools
if (tool_type != "function") {
SRV_WRN("skipping unsupported tool type '%s' in Responses conversion\n", tool_type.c_str());
continue;
} }
json chatcmpl_tool;
resp_tool.erase("type"); resp_tool.erase("type");
chatcmpl_tool["type"] = "function"; chatcmpl_tool["type"] = "function";
@ -1422,7 +1439,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
chatcmpl_tools.push_back(chatcmpl_tool); chatcmpl_tools.push_back(chatcmpl_tool);
} }
chatcmpl_body.erase("tools"); chatcmpl_body.erase("tools");
chatcmpl_body["tools"] = chatcmpl_tools; if (!chatcmpl_tools.empty()) {
chatcmpl_body["tools"] = chatcmpl_tools;
}
} }
if (response_body.contains("max_output_tokens")) { if (response_body.contains("max_output_tokens")) {
@ -1430,6 +1449,15 @@ json convert_responses_to_chatcmpl(const json & response_body) {
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"]; chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
} }
// Strip Responses-only keys that have no chat completions equivalent
// (e.g. Codex CLI sends store, include, prompt_cache_key, web_search)
for (const char * key : {
"store", "include", "prompt_cache_key", "web_search",
"text", "truncation", "metadata",
}) {
chatcmpl_body.erase(key);
}
return chatcmpl_body; return chatcmpl_body;
} }

View File

@ -960,28 +960,66 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
output.push_back(json { output.push_back(json {
{"type", "function_call"}, {"type", "function_call"},
{"status", "completed"}, {"id", "fc_" + random_string()},
{"arguments", tool_call.arguments}, {"call_id", tool_call.id},
{"call_id", "fc_" + tool_call.id},
{"name", tool_call.name}, {"name", tool_call.name},
{"arguments", tool_call.arguments},
{"status", "completed"},
}); });
} }
// Build output_text convenience field (concatenation of all output_text parts)
std::string output_text;
for (const auto & item : output) {
if (json_value(item, "type", std::string()) == "message") {
for (const auto & part : item.at("content")) {
if (json_value(part, "type", std::string()) == "output_text") {
output_text += part.at("text").get<std::string>();
}
}
}
}
std::time_t t = std::time(0); std::time_t t = std::time(0);
json res = { json res = {
{"completed_at", t}, {"completed_at", t},
{"created_at", t}, {"created_at", t},
{"id", oai_resp_id}, {"id", oai_resp_id},
{"model", oaicompat_model}, {"model", oaicompat_model},
{"object", "response"}, {"object", "response"},
{"output", output}, {"output", output},
{"status", "completed"}, {"output_text", output_text},
{"usage", json { {"status", "completed"},
{"input_tokens", n_prompt_tokens}, {"usage", json {
{"output_tokens", n_decoded}, {"input_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens}, {"output_tokens", n_decoded},
{"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }}, {"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
}}, }},
{"incomplete_details", nullptr},
{"previous_response_id", nullptr},
{"instructions", nullptr},
{"error", nullptr},
{"tools", json::array()},
{"tool_choice", "auto"},
{"truncation", "disabled"},
{"parallel_tool_calls", false},
{"text", json{{"format", json{{"type", "text"}}}}},
{"top_p", 1.0},
{"presence_penalty", 0.0},
{"frequency_penalty", 0.0},
{"top_logprobs", 0},
{"temperature", 1.0},
{"reasoning", nullptr},
{"max_output_tokens", nullptr},
{"max_tool_calls", nullptr},
{"store", false},
{"background", false},
{"service_tier", "default"},
{"safety_identifier", nullptr},
{"prompt_cache_key", nullptr},
{"metadata", json::object()},
}; };
return res; return res;
@ -990,6 +1028,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
std::vector<json> server_sent_events; std::vector<json> server_sent_events;
std::vector<json> output; std::vector<json> output;
int & seq_num = oai_resp_seq_num;
if (oaicompat_msg.reasoning_content != "") { if (oaicompat_msg.reasoning_content != "") {
const json output_item = json { const json output_item = json {
@ -1006,8 +1045,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json { server_sent_events.push_back(json {
{"event", "response.output_item.done"}, {"event", "response.output_item.done"},
{"data", json { {"data", json {
{"type", "response.output_item.done"}, {"type", "response.output_item.done"},
{"item", output_item} {"sequence_number", seq_num++},
{"output_index", 0},
{"item", output_item},
}} }}
}); });
output.push_back(output_item); output.push_back(output_item);
@ -1017,9 +1058,13 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json { server_sent_events.push_back(json {
{"event", "response.output_text.done"}, {"event", "response.output_text.done"},
{"data", json { {"data", json {
{"type", "response.output_text.done"}, {"type", "response.output_text.done"},
{"item_id", oai_resp_message_id}, {"sequence_number", seq_num++},
{"text", oaicompat_msg.content} {"output_index", 0},
{"content_index", 0},
{"item_id", oai_resp_message_id},
{"text", oaicompat_msg.content},
{"logprobs", json::array()},
}} }}
}); });
@ -1033,9 +1078,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json { server_sent_events.push_back(json {
{"event", "response.content_part.done"}, {"event", "response.content_part.done"},
{"data", json { {"data", json {
{"type", "response.content_part.done"}, {"type", "response.content_part.done"},
{"item_id", oai_resp_message_id}, {"sequence_number", seq_num++},
{"part", content_part} {"output_index", 0},
{"content_index", 0},
{"item_id", oai_resp_message_id},
{"part", content_part},
}} }}
}); });
const json output_item = { const json output_item = {
@ -1049,8 +1097,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json { server_sent_events.push_back(json {
{"event", "response.output_item.done"}, {"event", "response.output_item.done"},
{"data", json { {"data", json {
{"type", "response.output_item.done"}, {"type", "response.output_item.done"},
{"item", output_item} {"sequence_number", seq_num++},
{"output_index", 0},
{"item", output_item},
}} }}
}); });
output.push_back(output_item); output.push_back(output_item);
@ -1059,39 +1109,81 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
const json output_item = { const json output_item = {
{"type", "function_call"}, {"type", "function_call"},
{"status", "completed"}, {"id", "fc_" + random_string()},
{"call_id", tool_call.id},
{"name", tool_call.name},
{"arguments", tool_call.arguments}, {"arguments", tool_call.arguments},
{"call_id", "fc_" + tool_call.id}, {"status", "completed"},
{"name", tool_call.name}
}; };
server_sent_events.push_back(json { server_sent_events.push_back(json {
{"event", "response.output_item.done"}, {"event", "response.output_item.done"},
{"data", json { {"data", json {
{"type", "response.output_item.done"}, {"type", "response.output_item.done"},
{"item", output_item} {"sequence_number", seq_num++},
{"output_index", 0},
{"item", output_item},
}} }}
}); });
output.push_back(output_item); output.push_back(output_item);
} }
// Build output_text convenience field for streaming final event
std::string output_text_stream;
for (const auto & item : output) {
if (json_value(item, "type", std::string()) == "message") {
for (const auto & part : item.at("content")) {
if (json_value(part, "type", std::string()) == "output_text") {
output_text_stream += part.at("text").get<std::string>();
}
}
}
}
std::time_t t = std::time(0); std::time_t t = std::time(0);
server_sent_events.push_back(json { server_sent_events.push_back(json {
{"event", "response.completed"}, {"event", "response.completed"},
{"data", json { {"data", json {
{"type", "response.completed"}, {"type", "response.completed"},
{"sequence_number", seq_num++},
{"response", json { {"response", json {
{"id", oai_resp_id}, {"completed_at", t},
{"object", "response"}, {"created_at", t},
{"created_at", t}, {"id", oai_resp_id},
{"status", "completed"}, {"object", "response"},
{"model", oaicompat_model}, {"status", "completed"},
{"output", output}, {"model", oaicompat_model},
{"usage", json { {"output", output},
{"input_tokens", n_prompt_tokens}, {"output_text", output_text_stream},
{"output_tokens", n_decoded}, {"usage", json {
{"total_tokens", n_decoded + n_prompt_tokens}, {"input_tokens", n_prompt_tokens},
{"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }}, {"output_tokens", n_decoded},
}} {"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
}},
{"incomplete_details", nullptr},
{"previous_response_id", nullptr},
{"instructions", nullptr},
{"error", nullptr},
{"tools", json::array()},
{"tool_choice", "auto"},
{"truncation", "disabled"},
{"parallel_tool_calls", false},
{"text", json{{"format", json{{"type", "text"}}}}},
{"top_p", 1.0},
{"presence_penalty", 0.0},
{"frequency_penalty", 0.0},
{"top_logprobs", 0},
{"temperature", 1.0},
{"reasoning", nullptr},
{"max_output_tokens", nullptr},
{"max_tool_calls", nullptr},
{"store", false},
{"background", false},
{"service_tier", "default"},
{"safety_identifier", nullptr},
{"prompt_cache_key", nullptr},
{"metadata", json::object()},
}}, }},
}} }}
}); });

View File

@ -370,6 +370,7 @@ struct server_task_result_cmpl_final : server_task_result {
std::string oai_resp_id; std::string oai_resp_id;
std::string oai_resp_reasoning_id; std::string oai_resp_reasoning_id;
std::string oai_resp_message_id; std::string oai_resp_message_id;
int oai_resp_seq_num = 0;
virtual bool is_stop() override { virtual bool is_stop() override {
return true; // in stream mode, final responses are considered stop return true; // in stream mode, final responses are considered stop