From 302c3c8f61f8a3c33b54c8da2c9ee79d4df0eb6c Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Mon, 30 Mar 2026 09:39:59 +0200 Subject: [PATCH 1/7] server: improve Responses API compliance and Codex CLI compatibility Codex CLI compatibility: - Skip non-function tool types (web_search, code_interpreter) - Merge developer/system messages into position 0 for Qwen templates - Strip Responses-only request keys (store, include, prompt_cache_key) - output_text convenience field in streaming and non-streaming responses Responses API compliance (ideas from #19720 by riskywindow, adapted): - Add 24 missing Response object fields per OpenAI spec - Fix function_call id/call_id field mapping - Add sequence_number, output_index, content_index to streaming events - Accept input_text type and EasyInputMessage for multi-turn input Verified: codex -p local and codex -p fast work against local llama.cpp with Qwen3.5 models including native tool calling. Refs: ggml-org/llama.cpp#19138, ggml-org/llama.cpp#19720 --- tools/server/server-common.cpp | 84 ++++++++++------ tools/server/server-task.cpp | 178 +++++++++++++++++++++++++-------- tools/server/server-task.h | 1 + 3 files changed, 192 insertions(+), 71 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index ed5e306fc5..974823017b 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1255,6 +1255,25 @@ json convert_responses_to_chatcmpl(const json & response_body) { if (item.contains("status")) { item.erase("status"); } + // Merge system/developer messages into the first system message. + // Many model templates (e.g. Qwen) require all system content at + // position 0 and reject system messages elsewhere in the conversation. 
+ if (item.at("role") == "system" || item.at("role") == "developer") { + if (!chatcmpl_messages.empty() && chatcmpl_messages[0].value("role", "") == "system") { + auto & first_msg = chatcmpl_messages[0]; + // Convert string content to array format if needed + if (first_msg["content"].is_string()) { + std::string old_text = first_msg["content"].get(); + first_msg["content"] = json::array({json{{"text", old_text}, {"type", "text"}}}); + } + auto & first_content = first_msg["content"]; + for (const auto & part : chatcmpl_content) { + first_content.push_back(part); + } + continue; // merged, don't push a separate message + } + item["role"] = "system"; + } item["content"] = chatcmpl_content; chatcmpl_messages.push_back(item); @@ -1266,35 +1285,25 @@ json convert_responses_to_chatcmpl(const json & response_body) { // item.at("status") == "completed" || // item.at("status") == "incomplete") && // item["status"] not sent by codex-cli - exists_and_is_string(item, "type") && - item.at("type") == "message" + // item["type"] == "message" for OutputMessage, absent for EasyInputMessage + (!item.contains("type") || item.at("type") == "message") ) { // #responses_create-input-input_item_list-item-output_message - auto chatcmpl_content = json::array(); + // Also handles AssistantMessageItemParam / EasyInputMessage with role "assistant" + std::vector chatcmpl_content; for (const auto & output_text : item.at("content")) { const std::string type = json_value(output_text, "type", std::string()); - if (type == "output_text") { - if (!exists_and_is_string(output_text, "text")) { - throw std::invalid_argument("'Output text' requires 'text'"); - // Ignore annotations and logprobs for now - chatcmpl_content.push_back({ - {"text", output_text.at("text")}, - {"type", "text"}, - }); - } - } else if (type == "refusal") { - if (!exists_and_is_string(output_text, "refusal")) { - throw std::invalid_argument("'Refusal' requires 'refusal'"); - // Ignore annotations and logprobs for now - 
chatcmpl_content.push_back({ - {"refusal", output_text.at("refusal")}, - {"type", "refusal"}, - }); - } - } else { - throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'"); + if (type != "output_text" && type != "input_text") { + throw std::invalid_argument("'type' must be 'output_text' or 'input_text'"); } + if (!exists_and_is_string(output_text, "text")) { + throw std::invalid_argument("'Output text' requires 'text'"); + } + chatcmpl_content.push_back({ + {"text", output_text.at("text")}, + {"type", "text"}, + }); } if (merge_prev) { @@ -1303,7 +1312,9 @@ json convert_responses_to_chatcmpl(const json & response_body) { prev_msg["content"] = json::array(); } auto & prev_content = prev_msg["content"]; - prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end()); + for (const auto & part : chatcmpl_content) { + prev_content.push_back(part); + } } else { item.erase("status"); item.erase("type"); @@ -1407,11 +1418,17 @@ json convert_responses_to_chatcmpl(const json & response_body) { } std::vector chatcmpl_tools; for (json resp_tool : response_body.at("tools")) { - json chatcmpl_tool; + const std::string tool_type = json_value(resp_tool, "type", std::string()); - if (json_value(resp_tool, "type", std::string()) != "function") { - throw std::invalid_argument("'type' of tool must be 'function'"); + // Skip non-function tools (e.g. 
web_search, code_interpreter) + // sent by clients like Codex CLI — these are provider-specific + // and cannot be converted to chat completions function tools + if (tool_type != "function") { + SRV_WRN("skipping unsupported tool type '%s' in Responses conversion\n", tool_type.c_str()); + continue; } + + json chatcmpl_tool; resp_tool.erase("type"); chatcmpl_tool["type"] = "function"; @@ -1422,7 +1439,9 @@ json convert_responses_to_chatcmpl(const json & response_body) { chatcmpl_tools.push_back(chatcmpl_tool); } chatcmpl_body.erase("tools"); - chatcmpl_body["tools"] = chatcmpl_tools; + if (!chatcmpl_tools.empty()) { + chatcmpl_body["tools"] = chatcmpl_tools; + } } if (response_body.contains("max_output_tokens")) { @@ -1430,6 +1449,15 @@ json convert_responses_to_chatcmpl(const json & response_body) { chatcmpl_body["max_tokens"] = response_body["max_output_tokens"]; } + // Strip Responses-only keys that have no chat completions equivalent + // (e.g. Codex CLI sends store, include, prompt_cache_key, web_search) + for (const char * key : { + "store", "include", "prompt_cache_key", "web_search", + "text", "truncation", "metadata", + }) { + chatcmpl_body.erase(key); + } + return chatcmpl_body; } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 3018ac90f8..96a7e3cb33 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -960,28 +960,66 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() { for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { output.push_back(json { {"type", "function_call"}, - {"status", "completed"}, - {"arguments", tool_call.arguments}, - {"call_id", "fc_" + tool_call.id}, + {"id", "fc_" + random_string()}, + {"call_id", tool_call.id}, {"name", tool_call.name}, + {"arguments", tool_call.arguments}, + {"status", "completed"}, }); } + // Build output_text convenience field (concatenation of all output_text parts) + std::string output_text; + for (const auto & item : 
output) { + if (json_value(item, "type", std::string()) == "message") { + for (const auto & part : item.at("content")) { + if (json_value(part, "type", std::string()) == "output_text") { + output_text += part.at("text").get(); + } + } + } + } + std::time_t t = std::time(0); json res = { - {"completed_at", t}, - {"created_at", t}, - {"id", oai_resp_id}, - {"model", oaicompat_model}, - {"object", "response"}, - {"output", output}, - {"status", "completed"}, - {"usage", json { - {"input_tokens", n_prompt_tokens}, - {"output_tokens", n_decoded}, - {"total_tokens", n_decoded + n_prompt_tokens}, - {"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }}, + {"completed_at", t}, + {"created_at", t}, + {"id", oai_resp_id}, + {"model", oaicompat_model}, + {"object", "response"}, + {"output", output}, + {"output_text", output_text}, + {"status", "completed"}, + {"usage", json { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens}, + {"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}}, + {"output_tokens_details", json{{"reasoning_tokens", 0}}}, }}, + {"incomplete_details", nullptr}, + {"previous_response_id", nullptr}, + {"instructions", nullptr}, + {"error", nullptr}, + {"tools", json::array()}, + {"tool_choice", "auto"}, + {"truncation", "disabled"}, + {"parallel_tool_calls", false}, + {"text", json{{"format", json{{"type", "text"}}}}}, + {"top_p", 1.0}, + {"presence_penalty", 0.0}, + {"frequency_penalty", 0.0}, + {"top_logprobs", 0}, + {"temperature", 1.0}, + {"reasoning", nullptr}, + {"max_output_tokens", nullptr}, + {"max_tool_calls", nullptr}, + {"store", false}, + {"background", false}, + {"service_tier", "default"}, + {"safety_identifier", nullptr}, + {"prompt_cache_key", nullptr}, + {"metadata", json::object()}, }; return res; @@ -990,6 +1028,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() { json 
server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { std::vector server_sent_events; std::vector output; + int & seq_num = oai_resp_seq_num; if (oaicompat_msg.reasoning_content != "") { const json output_item = json { @@ -1006,8 +1045,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { server_sent_events.push_back(json { {"event", "response.output_item.done"}, {"data", json { - {"type", "response.output_item.done"}, - {"item", output_item} + {"type", "response.output_item.done"}, + {"sequence_number", seq_num++}, + {"output_index", 0}, + {"item", output_item}, }} }); output.push_back(output_item); @@ -1017,9 +1058,13 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { server_sent_events.push_back(json { {"event", "response.output_text.done"}, {"data", json { - {"type", "response.output_text.done"}, - {"item_id", oai_resp_message_id}, - {"text", oaicompat_msg.content} + {"type", "response.output_text.done"}, + {"sequence_number", seq_num++}, + {"output_index", 0}, + {"content_index", 0}, + {"item_id", oai_resp_message_id}, + {"text", oaicompat_msg.content}, + {"logprobs", json::array()}, }} }); @@ -1033,9 +1078,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { server_sent_events.push_back(json { {"event", "response.content_part.done"}, {"data", json { - {"type", "response.content_part.done"}, - {"item_id", oai_resp_message_id}, - {"part", content_part} + {"type", "response.content_part.done"}, + {"sequence_number", seq_num++}, + {"output_index", 0}, + {"content_index", 0}, + {"item_id", oai_resp_message_id}, + {"part", content_part}, }} }); const json output_item = { @@ -1049,8 +1097,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { server_sent_events.push_back(json { {"event", "response.output_item.done"}, {"data", json { - {"type", "response.output_item.done"}, - {"item", output_item} + {"type", "response.output_item.done"}, + {"sequence_number", 
seq_num++}, + {"output_index", 0}, + {"item", output_item}, }} }); output.push_back(output_item); @@ -1059,39 +1109,81 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { const json output_item = { {"type", "function_call"}, - {"status", "completed"}, + {"id", "fc_" + random_string()}, + {"call_id", tool_call.id}, + {"name", tool_call.name}, {"arguments", tool_call.arguments}, - {"call_id", "fc_" + tool_call.id}, - {"name", tool_call.name} + {"status", "completed"}, }; server_sent_events.push_back(json { {"event", "response.output_item.done"}, {"data", json { - {"type", "response.output_item.done"}, - {"item", output_item} + {"type", "response.output_item.done"}, + {"sequence_number", seq_num++}, + {"output_index", 0}, + {"item", output_item}, }} }); output.push_back(output_item); } + // Build output_text convenience field for streaming final event + std::string output_text_stream; + for (const auto & item : output) { + if (json_value(item, "type", std::string()) == "message") { + for (const auto & part : item.at("content")) { + if (json_value(part, "type", std::string()) == "output_text") { + output_text_stream += part.at("text").get(); + } + } + } + } + std::time_t t = std::time(0); server_sent_events.push_back(json { {"event", "response.completed"}, {"data", json { - {"type", "response.completed"}, + {"type", "response.completed"}, + {"sequence_number", seq_num++}, {"response", json { - {"id", oai_resp_id}, - {"object", "response"}, - {"created_at", t}, - {"status", "completed"}, - {"model", oaicompat_model}, - {"output", output}, - {"usage", json { - {"input_tokens", n_prompt_tokens}, - {"output_tokens", n_decoded}, - {"total_tokens", n_decoded + n_prompt_tokens}, - {"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }}, - }} + {"completed_at", t}, + {"created_at", t}, + {"id", oai_resp_id}, + {"object", "response"}, + {"status", "completed"}, + 
{"model", oaicompat_model}, + {"output", output}, + {"output_text", output_text_stream}, + {"usage", json { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens}, + {"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}}, + {"output_tokens_details", json{{"reasoning_tokens", 0}}}, + }}, + {"incomplete_details", nullptr}, + {"previous_response_id", nullptr}, + {"instructions", nullptr}, + {"error", nullptr}, + {"tools", json::array()}, + {"tool_choice", "auto"}, + {"truncation", "disabled"}, + {"parallel_tool_calls", false}, + {"text", json{{"format", json{{"type", "text"}}}}}, + {"top_p", 1.0}, + {"presence_penalty", 0.0}, + {"frequency_penalty", 0.0}, + {"top_logprobs", 0}, + {"temperature", 1.0}, + {"reasoning", nullptr}, + {"max_output_tokens", nullptr}, + {"max_tool_calls", nullptr}, + {"store", false}, + {"background", false}, + {"service_tier", "default"}, + {"safety_identifier", nullptr}, + {"prompt_cache_key", nullptr}, + {"metadata", json::object()}, }}, }} }); diff --git a/tools/server/server-task.h b/tools/server/server-task.h index a49ddb594b..28ec7b8f6b 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -370,6 +370,7 @@ struct server_task_result_cmpl_final : server_task_result { std::string oai_resp_id; std::string oai_resp_reasoning_id; std::string oai_resp_message_id; + int oai_resp_seq_num = 0; virtual bool is_stop() override { return true; // in stream mode, final responses are considered stop From 467266ba4cb29372c9ca12892df4c17fe7e78488 Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Mon, 30 Mar 2026 12:46:54 +0200 Subject: [PATCH 2/7] server: add tests for Responses API compliance and Codex compatibility Add 8 new tests covering the changes in this PR: - test_responses_schema_fields: verify all 24+ Response object fields - test_responses_stream_schema_fields: verify sequence_number, output_index, content_index on streaming events - 
test_responses_non_function_tool_skipped: web_search/code_interpreter tool types return 200 instead of 400 - test_responses_mixed_tool_types: non-function tools filtered, function tools retained (not rejected at parsing layer) - test_responses_extra_keys_stripped: store, include, prompt_cache_key, web_search, text, truncation, metadata don't cause errors - test_responses_developer_role: developer messages merged into system - test_responses_input_text_type: input_text accepted for EasyInputMessage - test_responses_function_call_id_fields: output items have correct ids All 10 tests pass (2 existing + 8 new). --- tools/server/server-common.cpp | 27 +- tools/server/server-task.cpp | 192 +++++----- .../tests/unit/test_compat_oai_responses.py | 331 ++++++++++++++++++ 3 files changed, 429 insertions(+), 121 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 974823017b..58db4934fe 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1294,16 +1294,25 @@ json convert_responses_to_chatcmpl(const json & response_body) { for (const auto & output_text : item.at("content")) { const std::string type = json_value(output_text, "type", std::string()); - if (type != "output_text" && type != "input_text") { - throw std::invalid_argument("'type' must be 'output_text' or 'input_text'"); + if (type == "output_text" || type == "input_text") { + if (!exists_and_is_string(output_text, "text")) { + throw std::invalid_argument("'Output text' requires 'text'"); + } + chatcmpl_content.push_back({ + {"text", output_text.at("text")}, + {"type", "text"}, + }); + } else if (type == "refusal") { + if (!exists_and_is_string(output_text, "refusal")) { + throw std::invalid_argument("'Refusal' requires 'refusal'"); + } + chatcmpl_content.push_back({ + {"refusal", output_text.at("refusal")}, + {"type", "refusal"}, + }); + } else { + throw std::invalid_argument("'type' must be 'output_text', 'input_text', or 'refusal'"); } - if 
(!exists_and_is_string(output_text, "text")) { - throw std::invalid_argument("'Output text' requires 'text'"); - } - chatcmpl_content.push_back({ - {"text", output_text.at("text")}, - {"type", "text"}, - }); } if (merge_prev) { diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 96a7e3cb33..f9dc319a03 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -917,6 +917,70 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { return deltas; } +static std::string build_output_text(const std::vector & output) { + std::string result; + for (const auto & item : output) { + if (json_value(item, "type", std::string()) == "message") { + for (const auto & part : item.at("content")) { + if (json_value(part, "type", std::string()) == "output_text") { + result += part.at("text").get(); + } + } + } + } + return result; +} + +static json build_oai_resp_metadata(const std::string & oai_resp_id, + const std::string & oaicompat_model, + const std::vector & output, + const std::string & output_text, + int n_prompt_tokens, + int n_decoded, + int n_prompt_tokens_cache) { + std::time_t t = std::time(0); + return json { + {"completed_at", t}, + {"created_at", t}, + {"id", oai_resp_id}, + {"model", oaicompat_model}, + {"object", "response"}, + {"output", output}, + {"output_text", output_text}, + {"status", "completed"}, + {"usage", json { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens}, + {"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}}, + {"output_tokens_details", json{{"reasoning_tokens", 0}}}, + }}, + {"incomplete_details", nullptr}, + {"previous_response_id", nullptr}, + {"instructions", nullptr}, + {"error", nullptr}, + {"tools", json::array()}, + {"tool_choice", "auto"}, + {"truncation", "disabled"}, + {"parallel_tool_calls", false}, + {"text", json{{"format", json{{"type", "text"}}}}}, + {"top_p", 1.0}, + 
{"presence_penalty", 0.0}, + {"frequency_penalty", 0.0}, + {"top_logprobs", 0}, + {"temperature", 1.0}, + {"reasoning", nullptr}, + {"max_output_tokens", nullptr}, + {"max_tool_calls", nullptr}, + {"store", false}, + {"background", false}, + {"service_tier", "default"}, + {"safety_identifier", nullptr}, + {"prompt_cache_key", nullptr}, + {"metadata", json::object()}, + }; +} + json server_task_result_cmpl_final::to_json_oaicompat_resp() { common_chat_msg msg; if (!oaicompat_msg.empty()) { @@ -968,67 +1032,16 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() { }); } - // Build output_text convenience field (concatenation of all output_text parts) - std::string output_text; - for (const auto & item : output) { - if (json_value(item, "type", std::string()) == "message") { - for (const auto & part : item.at("content")) { - if (json_value(part, "type", std::string()) == "output_text") { - output_text += part.at("text").get(); - } - } - } - } - - std::time_t t = std::time(0); - json res = { - {"completed_at", t}, - {"created_at", t}, - {"id", oai_resp_id}, - {"model", oaicompat_model}, - {"object", "response"}, - {"output", output}, - {"output_text", output_text}, - {"status", "completed"}, - {"usage", json { - {"input_tokens", n_prompt_tokens}, - {"output_tokens", n_decoded}, - {"total_tokens", n_decoded + n_prompt_tokens}, - {"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}}, - {"output_tokens_details", json{{"reasoning_tokens", 0}}}, - }}, - {"incomplete_details", nullptr}, - {"previous_response_id", nullptr}, - {"instructions", nullptr}, - {"error", nullptr}, - {"tools", json::array()}, - {"tool_choice", "auto"}, - {"truncation", "disabled"}, - {"parallel_tool_calls", false}, - {"text", json{{"format", json{{"type", "text"}}}}}, - {"top_p", 1.0}, - {"presence_penalty", 0.0}, - {"frequency_penalty", 0.0}, - {"top_logprobs", 0}, - {"temperature", 1.0}, - {"reasoning", nullptr}, - {"max_output_tokens", nullptr}, - {"max_tool_calls", 
nullptr}, - {"store", false}, - {"background", false}, - {"service_tier", "default"}, - {"safety_identifier", nullptr}, - {"prompt_cache_key", nullptr}, - {"metadata", json::object()}, - }; - - return res; + std::string output_text = build_output_text(output); + return build_oai_resp_metadata(oai_resp_id, oaicompat_model, output, output_text, + n_prompt_tokens, n_decoded, n_prompt_tokens_cache); } json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { std::vector server_sent_events; std::vector output; int & seq_num = oai_resp_seq_num; + int output_idx = 0; if (oaicompat_msg.reasoning_content != "") { const json output_item = json { @@ -1047,11 +1060,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { {"data", json { {"type", "response.output_item.done"}, {"sequence_number", seq_num++}, - {"output_index", 0}, + {"output_index", output_idx}, {"item", output_item}, }} }); output.push_back(output_item); + output_idx++; } if (oaicompat_msg.content != "") { @@ -1060,7 +1074,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { {"data", json { {"type", "response.output_text.done"}, {"sequence_number", seq_num++}, - {"output_index", 0}, + {"output_index", output_idx}, {"content_index", 0}, {"item_id", oai_resp_message_id}, {"text", oaicompat_msg.content}, @@ -1080,7 +1094,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { {"data", json { {"type", "response.content_part.done"}, {"sequence_number", seq_num++}, - {"output_index", 0}, + {"output_index", output_idx}, {"content_index", 0}, {"item_id", oai_resp_message_id}, {"part", content_part}, @@ -1099,11 +1113,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { {"data", json { {"type", "response.output_item.done"}, {"sequence_number", seq_num++}, - {"output_index", 0}, + {"output_index", output_idx}, {"item", output_item}, }} }); output.push_back(output_item); + output_idx++; } for (const common_chat_tool_call & 
tool_call : oaicompat_msg.tool_calls) { @@ -1120,71 +1135,24 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { {"data", json { {"type", "response.output_item.done"}, {"sequence_number", seq_num++}, - {"output_index", 0}, + {"output_index", output_idx}, {"item", output_item}, }} }); output.push_back(output_item); + output_idx++; } - // Build output_text convenience field for streaming final event - std::string output_text_stream; - for (const auto & item : output) { - if (json_value(item, "type", std::string()) == "message") { - for (const auto & part : item.at("content")) { - if (json_value(part, "type", std::string()) == "output_text") { - output_text_stream += part.at("text").get(); - } - } - } - } + std::string output_text = build_output_text(output); + json resp = build_oai_resp_metadata(oai_resp_id, oaicompat_model, output, output_text, + n_prompt_tokens, n_decoded, n_prompt_tokens_cache); - std::time_t t = std::time(0); server_sent_events.push_back(json { {"event", "response.completed"}, {"data", json { {"type", "response.completed"}, {"sequence_number", seq_num++}, - {"response", json { - {"completed_at", t}, - {"created_at", t}, - {"id", oai_resp_id}, - {"object", "response"}, - {"status", "completed"}, - {"model", oaicompat_model}, - {"output", output}, - {"output_text", output_text_stream}, - {"usage", json { - {"input_tokens", n_prompt_tokens}, - {"output_tokens", n_decoded}, - {"total_tokens", n_decoded + n_prompt_tokens}, - {"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}}, - {"output_tokens_details", json{{"reasoning_tokens", 0}}}, - }}, - {"incomplete_details", nullptr}, - {"previous_response_id", nullptr}, - {"instructions", nullptr}, - {"error", nullptr}, - {"tools", json::array()}, - {"tool_choice", "auto"}, - {"truncation", "disabled"}, - {"parallel_tool_calls", false}, - {"text", json{{"format", json{{"type", "text"}}}}}, - {"top_p", 1.0}, - {"presence_penalty", 0.0}, - {"frequency_penalty", 0.0}, - 
{"top_logprobs", 0}, - {"temperature", 1.0}, - {"reasoning", nullptr}, - {"max_output_tokens", nullptr}, - {"max_tool_calls", nullptr}, - {"store", false}, - {"background", false}, - {"service_tier", "default"}, - {"safety_identifier", nullptr}, - {"prompt_cache_key", nullptr}, - {"metadata", json::object()}, - }}, + {"response", resp}, }} }); diff --git a/tools/server/tests/unit/test_compat_oai_responses.py b/tools/server/tests/unit/test_compat_oai_responses.py index 7aab4a8ba6..1ece5bf878 100644 --- a/tools/server/tests/unit/test_compat_oai_responses.py +++ b/tools/server/tests/unit/test_compat_oai_responses.py @@ -71,3 +71,334 @@ def test_responses_stream_with_openai_library(): assert r.response.output[0].id.startswith("msg_") assert gathered_text == r.response.output_text assert match_regex("(Suddenly)+", r.response.output_text) + + +def test_responses_schema_fields(): + """Verify the 24 Response object fields added by this PR are present + with correct types and default values. These fields are required by + the OpenAI Responses API spec but were missing before this change.""" + global server + server.start() + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": "Book", + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert res.status_code == 200 + body = res.body + # Usage sub-fields added by this PR + usage = body["usage"] + assert isinstance(usage["input_tokens_details"]["cached_tokens"], int) + assert isinstance(usage["output_tokens_details"]["reasoning_tokens"], int) + # All 24 fields added by this PR must be present with correct defaults + assert body["incomplete_details"] is None + assert body["previous_response_id"] is None + assert body["instructions"] is None + assert body["error"] is None + assert body["tools"] == [] + assert body["tool_choice"] == "auto" + assert body["truncation"] == "disabled" + assert body["parallel_tool_calls"] == False + assert body["text"] == {"format": {"type": "text"}} + assert 
body["top_p"] == 1.0 + assert body["temperature"] == 1.0 + assert body["presence_penalty"] == 0.0 + assert body["frequency_penalty"] == 0.0 + assert body["top_logprobs"] == 0 + assert body["reasoning"] is None + assert body["max_output_tokens"] is None + assert body["store"] == False + assert body["service_tier"] == "default" + assert body["metadata"] == {} + assert body["background"] == False + assert body["safety_identifier"] is None + assert body["prompt_cache_key"] is None + assert body["max_tool_calls"] is None + + +def test_responses_stream_schema_fields(): + """Verify streaming done-events have the sequence_number, output_index, + and content_index fields added by this PR. Also verify the completed + response includes the 24 new schema fields.""" + global server + server.start() + res = server.make_stream_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": "Book", + "max_output_tokens": 8, + "temperature": 0.8, + "stream": True, + }) + seen_seq_nums = [] + saw_output_text_done = False + saw_content_part_done = False + saw_output_item_done = False + completed_response = None + for data in res: + if "sequence_number" in data: + seen_seq_nums.append(data["sequence_number"]) + if data.get("type") == "response.output_text.done": + saw_output_text_done = True + assert "content_index" in data + assert "output_index" in data + assert "logprobs" in data + assert isinstance(data["logprobs"], list) + if data.get("type") == "response.content_part.done": + saw_content_part_done = True + assert "content_index" in data + assert "output_index" in data + if data.get("type") == "response.output_item.done": + saw_output_item_done = True + assert "output_index" in data + if data.get("type") == "response.completed": + completed_response = data["response"] + # Must have seen all done-event types + assert saw_output_text_done, "never received response.output_text.done" + assert saw_content_part_done, "never received response.content_part.done" + assert 
saw_output_item_done, "never received response.output_item.done" + # sequence_number must be present on done events and monotonically increasing + assert len(seen_seq_nums) >= 4, f"expected >= 4 sequenced events, got {len(seen_seq_nums)}" + assert all(a < b for a, b in zip(seen_seq_nums, seen_seq_nums[1:])), "sequence_numbers not strictly increasing" + # completed response must have the new schema fields with correct values + assert completed_response is not None + assert completed_response["metadata"] == {} + assert completed_response["store"] == False + assert completed_response["truncation"] == "disabled" + assert completed_response["usage"]["output_tokens_details"]["reasoning_tokens"] == 0 + + +def test_responses_non_function_tool_skipped(): + """Non-function tool types must be silently skipped, producing a valid + completion with no tools field in the converted chat request. Upstream + rejects non-function types with 400; our code must return 200 and + generate output as if no tools were provided.""" + global server + server.start() + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + "tools": [ + {"type": "web_search"}, + {"type": "code_interpreter"}, + ], + }) + assert res.status_code == 200 + assert res.body["status"] == "completed" + # With all tools skipped, the model must still produce text output + assert len(res.body["output"]) > 0 + assert len(res.body["output_text"]) > 0 + + +def test_responses_only_non_function_tools_same_as_no_tools(): + """When ALL tools are non-function types, they should all be filtered out + and the result should be identical to a request with no tools at all. 
+ Compare token counts to confirm the tools field was truly empty.""" + global server + server.start() + no_tools = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + with_skipped_tools = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + "tools": [ + {"type": "web_search"}, + {"type": "code_interpreter"}, + {"type": "file_search"}, + ], + }) + assert no_tools.status_code == 200 + assert with_skipped_tools.status_code == 200 + # If tools were truly stripped, prompt token count must be identical + assert with_skipped_tools.body["usage"]["input_tokens"] == no_tools.body["usage"]["input_tokens"] + + +def test_responses_extra_keys_stripped(): + """Responses-only request keys (store, include, prompt_cache_key, etc.) + must be stripped before forwarding to the chat completions handler. 
+ The completion must succeed and produce the same output as a request + without those keys.""" + global server + server.start() + # Baseline without extra keys + baseline = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert baseline.status_code == 200 + # Same request with extra Responses-only keys + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + "store": True, + "include": ["usage"], + "prompt_cache_key": "test_key", + "web_search": {"enabled": True}, + "text": {"format": {"type": "text"}}, + "truncation": "auto", + "metadata": {"key": "value"}, + }) + assert res.status_code == 200 + assert res.body["status"] == "completed" + # Extra keys must not affect token consumption + assert res.body["usage"]["input_tokens"] == baseline.body["usage"]["input_tokens"] + + +def test_responses_developer_role_merging(): + """Developer role messages must be merged into the first system message + at position 0. This ensures templates that require a single system + message don't see developer content as a separate turn. 
+ + We verify by comparing token counts: system + developer merged should + consume the same prompt tokens as a single system message with the + combined content.""" + global server + server.start() + # Single combined system message + combined = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": [ + {"type": "input_text", "text": "Book"}, + {"type": "input_text", "text": "Keep it short"}, + ]}, + {"role": "user", "content": [{"type": "input_text", "text": "What is the best book"}]}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert combined.status_code == 200 + # Split system + developer (should be merged to same prompt) + split = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": [{"type": "input_text", "text": "Book"}]}, + {"role": "user", "content": [{"type": "input_text", "text": "What is the best book"}]}, + {"role": "developer", "content": [{"type": "input_text", "text": "Keep it short"}]}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert split.status_code == 200 + assert split.body["status"] == "completed" + # Merged prompt should consume same number of input tokens + assert split.body["usage"]["input_tokens"] == combined.body["usage"]["input_tokens"] + + +def test_responses_input_text_type_multi_turn(): + """input_text type must be accepted for assistant messages (EasyInputMessage). + An assistant message without explicit type:'message' must also be accepted + (AssistantMessageItemParam). 
Verify the multi-turn context is preserved + by checking the model sees the full conversation.""" + global server + server.start() + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "user", "content": [{"type": "input_text", "text": "Hello"}]}, + { + "role": "assistant", + "content": [{"type": "input_text", "text": "Hi there"}], + }, + {"role": "user", "content": [{"type": "input_text", "text": "How are you"}]}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert res.status_code == 200 + assert res.body["status"] == "completed" + # Multi-turn input should result in more prompt tokens than single-turn + single = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": "How are you", + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert single.status_code == 200 + assert res.body["usage"]["input_tokens"] > single.body["usage"]["input_tokens"] + + +def test_responses_output_text_matches_content(): + """output_text must be the concatenation of all output_text content parts. 
+ Verify this for both streaming and non-streaming responses.""" + global server + server.start() + # Non-streaming + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert res.status_code == 200 + # Manually reconstruct output_text from content parts + reconstructed = "" + for item in res.body["output"]: + if item.get("type") == "message": + for part in item["content"]: + if part.get("type") == "output_text": + reconstructed += part["text"] + assert res.body["output_text"] == reconstructed + assert len(reconstructed) > 0 + + +def test_responses_stream_output_text_consistency(): + """Streaming gathered text must match the output_text in response.completed.""" + global server + server.start() + res = server.make_stream_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + "stream": True, + }) + gathered_text = "" + completed_output_text = None + for data in res: + if data.get("type") == "response.output_text.delta": + gathered_text += data["delta"] + if data.get("type") == "response.completed": + completed_output_text = data["response"]["output_text"] + # Also verify content parts match + for item in data["response"]["output"]: + if item.get("type") == "message": + for part in item["content"]: + if part.get("type") == "output_text": + assert part["text"] == gathered_text + assert completed_output_text is not None + assert gathered_text == completed_output_text + assert len(gathered_text) > 0 From 987340767cf5639e30931db7d0381aa4887acdcc Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Mon, 30 Mar 2026 18:13:20 +0200 Subject: [PATCH 3/7] server: add full streaming compliance for Responses API 
events - Add sequence_number to ALL streaming events (created, in_progress, output_item.added, content_part.added, all delta events) - Add output_index to all events referencing output items - Add content_index to content-related events - Populate full response object in response.created and response.in_progress events (was only {id, object, status}) - Add id field to function_call output_item.added events - Add status: completed to reasoning output_item.done events - Counter state persisted across streaming chunks via task_result_state Fixes: spec-compliant client libraries (async-openai) that require these fields can now parse all streaming events without error. Refs: ggml-org/llama.cpp#21174 (fumlig review comment) --- tools/server/server-task.cpp | 106 +++++++++++++++++++++++++---------- tools/server/server-task.h | 5 ++ 2 files changed, 80 insertions(+), 31 deletions(-) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index f9dc319a03..5d63e6b697 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1053,6 +1053,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { {"type", "reasoning_text"}, }})}, {"encrypted_content", ""}, + {"status", "completed"}, }; server_sent_events.push_back(json { @@ -1428,20 +1429,42 @@ void server_task_result_cmpl_partial::update(task_result_state & state) { oai_resp_reasoning_id = state.oai_resp_reasoning_id; oai_resp_message_id = state.oai_resp_message_id; oai_resp_fc_id = state.oai_resp_fc_id; + // seq_num/output_idx: read from state (may have been advanced by previous to_json call) + oai_resp_seq_num = state.oai_resp_seq_num; + oai_resp_output_idx = state.oai_resp_output_idx; // track if the accumulated message has any reasoning content anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty(); // Pre-compute state updates based on diffs (for next chunk) + // Also advance seq_num/output_idx to match events that to_json_oaicompat_resp() will emit + 
if (n_decoded == 1) { + state.oai_resp_seq_num += 2; // response.created + response.in_progress + } for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) { - if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) { - state.thinking_block_started = true; + if (!diff.reasoning_content_delta.empty()) { + if (!state.thinking_block_started) { + state.thinking_block_started = true; + state.oai_resp_seq_num++; // output_item.added + state.oai_resp_output_idx++; + } + state.oai_resp_seq_num++; // reasoning_text.delta } - if (!diff.content_delta.empty() && !state.text_block_started) { - state.text_block_started = true; + if (!diff.content_delta.empty()) { + if (!state.text_block_started) { + state.text_block_started = true; + state.oai_resp_seq_num += 2; // output_item.added + content_part.added + state.oai_resp_output_idx++; + } + state.oai_resp_seq_num++; // output_text.delta } if (!diff.tool_call_delta.name.empty()) { state.oai_resp_fc_id = diff.tool_call_delta.id; + state.oai_resp_seq_num++; // output_item.added + state.oai_resp_output_idx++; + } + if (!diff.tool_call_delta.arguments.empty()) { + state.oai_resp_seq_num++; // function_call_arguments.delta } } } @@ -1583,28 +1606,31 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { json server_task_result_cmpl_partial::to_json_oaicompat_resp() { std::vector<json> events; + int & seq_num = oai_resp_seq_num; + int & output_idx = oai_resp_output_idx; if (n_decoded == 1) { + // Build initial response object with all required fields but empty output + json initial_resp = build_oai_resp_metadata( + oai_resp_id, oaicompat_model, {}, "", + n_prompt_tokens, 0, n_prompt_tokens_cache); + initial_resp["status"] = "in_progress"; + initial_resp["completed_at"] = nullptr; + events.push_back(json { {"event", "response.created"}, {"data", json { - {"type", "response.created"}, - {"response", json { - {"id", oai_resp_id}, - {"object", "response"}, - {"status", "in_progress"}, - }}, + {"type",
"response.created"}, + {"sequence_number", seq_num++}, + {"response", initial_resp}, }}, }); events.push_back(json { {"event", "response.in_progress"}, {"data", json { - {"type", "response.in_progress"}, - {"response", json { - {"id", oai_resp_id}, - {"object", "response"}, - {"status", "in_progress"}, - }}, + {"type", "response.in_progress"}, + {"sequence_number", seq_num++}, + {"response", initial_resp}, }}, }); } @@ -1615,7 +1641,9 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { events.push_back(json { {"event", "response.output_item.added"}, {"data", json { - {"type", "response.output_item.added"}, + {"type", "response.output_item.added"}, + {"sequence_number", seq_num++}, + {"output_index", output_idx++}, {"item", json { {"id", oai_resp_reasoning_id}, {"summary", json::array()}, @@ -1631,9 +1659,12 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { events.push_back(json { {"event", "response.reasoning_text.delta"}, {"data", json { - {"type", "response.reasoning_text.delta"}, - {"delta", diff.reasoning_content_delta}, - {"item_id", oai_resp_reasoning_id}, + {"type", "response.reasoning_text.delta"}, + {"sequence_number", seq_num++}, + {"output_index", output_idx - 1}, + {"content_index", 0}, + {"delta", diff.reasoning_content_delta}, + {"item_id", oai_resp_reasoning_id}, }}, }); } @@ -1643,7 +1674,9 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { events.push_back(json { {"event", "response.output_item.added"}, {"data", json { - {"type", "response.output_item.added"}, + {"type", "response.output_item.added"}, + {"sequence_number", seq_num++}, + {"output_index", output_idx++}, {"item", json { {"content", json::array()}, {"id", oai_resp_message_id}, @@ -1656,8 +1689,11 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { events.push_back(json { {"event", "response.content_part.added"}, {"data", json { - {"type", "response.content_part.added"}, - {"item_id", oai_resp_message_id}, + {"type", 
"response.content_part.added"}, + {"sequence_number", seq_num++}, + {"output_index", output_idx - 1}, + {"content_index", 0}, + {"item_id", oai_resp_message_id}, {"part", json { {"type", "output_text"}, {"text", ""}, @@ -1669,9 +1705,12 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { events.push_back(json { {"event", "response.output_text.delta"}, {"data", json { - {"type", "response.output_text.delta"}, - {"item_id", oai_resp_message_id}, - {"delta", diff.content_delta}, + {"type", "response.output_text.delta"}, + {"sequence_number", seq_num++}, + {"output_index", output_idx - 1}, + {"content_index", 0}, + {"item_id", oai_resp_message_id}, + {"delta", diff.content_delta}, }}, }); } @@ -1680,10 +1719,13 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { events.push_back(json { {"event", "response.output_item.added"}, {"data", json { - {"type", "response.output_item.added"}, + {"type", "response.output_item.added"}, + {"sequence_number", seq_num++}, + {"output_index", output_idx++}, {"item", json { + {"id", "fc_" + random_string()}, {"arguments", ""}, - {"call_id", "fc_" + diff.tool_call_delta.id}, + {"call_id", diff.tool_call_delta.id}, {"name", diff.tool_call_delta.name}, {"type", "function_call"}, {"status", "in_progress"}, @@ -1697,9 +1739,11 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { events.push_back(json { {"event", "response.function_call_arguments.delta"}, {"data", json { - {"type", "response.function_call_arguments.delta"}, - {"delta", diff.tool_call_delta.arguments}, - {"item_id", "fc_" + oai_resp_fc_id}, + {"type", "response.function_call_arguments.delta"}, + {"sequence_number", seq_num++}, + {"output_index", output_idx - 1}, + {"delta", diff.tool_call_delta.arguments}, + {"item_id", "fc_" + oai_resp_fc_id}, }}, }); } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 28ec7b8f6b..a4ce0449a3 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ 
-110,6 +110,8 @@ struct task_result_state { const std::string oai_resp_reasoning_id; const std::string oai_resp_message_id; std::string oai_resp_fc_id; // function call ID for current args delta + int oai_resp_seq_num = 0; // monotonically increasing per-stream + int oai_resp_output_idx = 0; // tracks current output item index task_result_state(const common_chat_parser_params & chat_parser_params) : chat_parser_params(chat_parser_params) @@ -385,6 +387,7 @@ struct server_task_result_cmpl_final : server_task_result { oai_resp_id = state.oai_resp_id; oai_resp_reasoning_id = state.oai_resp_reasoning_id; oai_resp_message_id = state.oai_resp_message_id; + oai_resp_seq_num = state.oai_resp_seq_num; } json to_json_non_oaicompat(); @@ -437,6 +440,8 @@ struct server_task_result_cmpl_partial : server_task_result { std::string oai_resp_reasoning_id; std::string oai_resp_message_id; std::string oai_resp_fc_id; + int oai_resp_seq_num = 0; + int oai_resp_output_idx = 0; // for Anthropic API: track if any reasoning content has been generated bool anthropic_has_reasoning = false; From 5d51bbef1c679babf4b085b11b12e7fc52ce4d6a Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Mon, 30 Mar 2026 18:13:29 +0200 Subject: [PATCH 4/7] server: add streaming compliance tests for Responses API - test_responses_stream_created_event_has_full_response: verify response.created contains all 24+ fields with status in_progress - test_responses_stream_all_events_have_sequence_number: every event has sequence_number and they are strictly increasing across stream - test_responses_stream_delta_events_have_indices: output_index and content_index present on all delta/added events All 14 tests pass (2 original + 9 from previous commit + 3 new). 
--- .../tests/unit/test_compat_oai_responses.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/tools/server/tests/unit/test_compat_oai_responses.py b/tools/server/tests/unit/test_compat_oai_responses.py index 1ece5bf878..2f720f0809 100644 --- a/tools/server/tests/unit/test_compat_oai_responses.py +++ b/tools/server/tests/unit/test_compat_oai_responses.py @@ -402,3 +402,108 @@ def test_responses_stream_output_text_consistency(): assert completed_output_text is not None assert gathered_text == completed_output_text assert len(gathered_text) > 0 + + +def test_responses_stream_created_event_has_full_response(): + """response.created must contain the full response object with all required + fields, not just {id, object, status}. This is needed by strict client + libraries like async-openai.""" + global server + server.start() + res = server.make_stream_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + "stream": True, + }) + created_resp = None + in_progress_resp = None + for data in res: + if data.get("type") == "response.created": + created_resp = data["response"] + if data.get("type") == "response.in_progress": + in_progress_resp = data["response"] + assert created_resp is not None, "never received response.created" + assert in_progress_resp is not None, "never received response.in_progress" + # Both must have the full response object, not just minimal fields + for resp in [created_resp, in_progress_resp]: + assert resp["status"] == "in_progress" + assert resp["id"].startswith("resp_") + assert resp["object"] == "response" + assert resp["model"] is not None + assert "metadata" in resp + assert "store" in resp + assert "truncation" in resp + assert "tools" in resp + assert "usage" in resp + assert resp["output"] == [] + assert resp["output_text"] == "" + + +def 
test_responses_stream_all_events_have_sequence_number(): + """Every streaming event must have a sequence_number field and they must + be strictly increasing across the entire stream.""" + global server + server.start() + res = server.make_stream_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + "stream": True, + }) + all_seq_nums = [] + event_types = [] + for data in res: + assert "sequence_number" in data, f"missing sequence_number in event type {data.get('type')}" + all_seq_nums.append(data["sequence_number"]) + event_types.append(data.get("type", "unknown")) + # Must have received multiple events + assert len(all_seq_nums) >= 6, f"expected >= 6 events, got {len(all_seq_nums)}: {event_types}" + # Must be strictly increasing + for i in range(1, len(all_seq_nums)): + assert all_seq_nums[i] > all_seq_nums[i-1], \ + f"sequence_number not strictly increasing at index {i}: {all_seq_nums[i-1]} -> {all_seq_nums[i]} (events: {event_types[i-1]} -> {event_types[i]})" + + +def test_responses_stream_delta_events_have_indices(): + """Delta and added events must have output_index. 
Content-related events + must also have content_index.""" + global server + server.start() + res = server.make_stream_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + "stream": True, + }) + saw_output_item_added = False + saw_content_part_added = False + saw_output_text_delta = False + for data in res: + evt = data.get("type", "") + if evt == "response.output_item.added": + saw_output_item_added = True + assert "output_index" in data, "output_item.added missing output_index" + if evt == "response.content_part.added": + saw_content_part_added = True + assert "output_index" in data, "content_part.added missing output_index" + assert "content_index" in data, "content_part.added missing content_index" + if evt == "response.output_text.delta": + saw_output_text_delta = True + assert "output_index" in data, "output_text.delta missing output_index" + assert "content_index" in data, "output_text.delta missing content_index" + assert saw_output_item_added, "never received response.output_item.added" + assert saw_content_part_added, "never received response.content_part.added" + assert saw_output_text_delta, "never received response.output_text.delta" From 35f62f9eb3a8337b6dc946ce7f24421bc7716465 Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Mon, 30 Mar 2026 18:24:39 +0200 Subject: [PATCH 5/7] server: fix streaming event bugs and tighten test assertions Code fixes: - build_oai_resp_metadata accepts status param; completed_at is null when status is in_progress (was always set to timestamp) - response.created/in_progress events use zeroed usage (was passing actual prompt tokens before response was logically started) - Function call item IDs are now generated once per tool call in update() and reused consistently across output_item.added, function_call_arguments.delta, and output_item.done 
events (was generating independent random IDs in each path) - Clean up commented-out status checks in server-common.cpp Test fixes: - Assert sequence_number on every event unconditionally (was using weak "if present" guard) - Check actual values not just key presence in streaming created event test (completed_at is None, usage tokens are 0, etc.) Refs: ggml-org/llama.cpp#21174 (patrick review) --- tools/server/server-common.cpp | 8 ++--- tools/server/server-task.cpp | 30 +++++++++++-------- tools/server/server-task.h | 11 +++++-- .../tests/unit/test_compat_oai_responses.py | 16 +++++----- 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 58db4934fe..ae45f24f74 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1280,12 +1280,8 @@ json convert_responses_to_chatcmpl(const json & response_body) { } else if (exists_and_is_array(item, "content") && exists_and_is_string(item, "role") && item.at("role") == "assistant" && - // exists_and_is_string(item, "status") && - // (item.at("status") == "in_progress" || - // item.at("status") == "completed" || - // item.at("status") == "incomplete") && - // item["status"] not sent by codex-cli - // item["type"] == "message" for OutputMessage, absent for EasyInputMessage + // status not checked (not always present, e.g. 
codex-cli omits it) + // type == "message" for OutputMessage, absent for EasyInputMessage (!item.contains("type") || item.at("type") == "message") ) { // #responses_create-input-input_item_list-item-output_message diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 5d63e6b697..b2de62d86f 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -937,17 +937,18 @@ static json build_oai_resp_metadata(const std::string & oai_resp_id, const std::string & output_text, int n_prompt_tokens, int n_decoded, - int n_prompt_tokens_cache) { + int n_prompt_tokens_cache, + const std::string & status = "completed") { std::time_t t = std::time(0); return json { - {"completed_at", t}, + {"completed_at", status == "completed" ? json(t) : json(nullptr)}, {"created_at", t}, {"id", oai_resp_id}, {"model", oaicompat_model}, {"object", "response"}, {"output", output}, {"output_text", output_text}, - {"status", "completed"}, + {"status", status}, {"usage", json { {"input_tokens", n_prompt_tokens}, {"output_tokens", n_decoded}, @@ -1122,10 +1123,14 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { output_idx++; } - for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { + for (size_t tc_idx = 0; tc_idx < oaicompat_msg.tool_calls.size(); tc_idx++) { + const common_chat_tool_call & tool_call = oaicompat_msg.tool_calls[tc_idx]; + const std::string fc_id = tc_idx < oai_resp_fc_item_ids.size() + ? 
oai_resp_fc_item_ids[tc_idx] + : "fc_" + random_string(); // fallback for non-streaming path const json output_item = { {"type", "function_call"}, - {"id", "fc_" + random_string()}, + {"id", fc_id}, {"call_id", tool_call.id}, {"name", tool_call.name}, {"arguments", tool_call.arguments}, @@ -1429,7 +1434,7 @@ void server_task_result_cmpl_partial::update(task_result_state & state) { oai_resp_reasoning_id = state.oai_resp_reasoning_id; oai_resp_message_id = state.oai_resp_message_id; oai_resp_fc_id = state.oai_resp_fc_id; - // seq_num/output_idx: read from state (may have been advanced by previous to_json call) + oai_resp_fc_item_id = state.oai_resp_fc_item_id; oai_resp_seq_num = state.oai_resp_seq_num; oai_resp_output_idx = state.oai_resp_output_idx; @@ -1460,6 +1465,8 @@ void server_task_result_cmpl_partial::update(task_result_state & state) { } if (!diff.tool_call_delta.name.empty()) { state.oai_resp_fc_id = diff.tool_call_delta.id; + state.oai_resp_fc_item_id = "fc_" + random_string(); + state.oai_resp_fc_item_ids.push_back(state.oai_resp_fc_item_id); state.oai_resp_seq_num++; // output_item.added state.oai_resp_output_idx++; } @@ -1610,12 +1617,10 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { int & output_idx = oai_resp_output_idx; if (n_decoded == 1) { - // Build initial response object with all required fields but empty output + // Build initial response object with all required fields but empty output and zeroed usage json initial_resp = build_oai_resp_metadata( oai_resp_id, oaicompat_model, {}, "", - n_prompt_tokens, 0, n_prompt_tokens_cache); - initial_resp["status"] = "in_progress"; - initial_resp["completed_at"] = nullptr; + 0, 0, 0, "in_progress"); events.push_back(json { {"event", "response.created"}, @@ -1723,7 +1728,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { {"sequence_number", seq_num++}, {"output_index", output_idx++}, {"item", json { - {"id", "fc_" + random_string()}, + {"id", oai_resp_fc_item_id}, 
{"arguments", ""}, {"call_id", diff.tool_call_delta.id}, {"name", diff.tool_call_delta.name}, @@ -1732,7 +1737,6 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { }}, }}, }); - oai_resp_fc_id = diff.tool_call_delta.id; } if (!diff.tool_call_delta.arguments.empty()) { @@ -1743,7 +1747,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { {"sequence_number", seq_num++}, {"output_index", output_idx - 1}, {"delta", diff.tool_call_delta.arguments}, - {"item_id", "fc_" + oai_resp_fc_id}, + {"item_id", oai_resp_fc_item_id}, }}, }); } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index a4ce0449a3..49040445d3 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -109,9 +109,11 @@ struct task_result_state { const std::string oai_resp_id; const std::string oai_resp_reasoning_id; const std::string oai_resp_message_id; - std::string oai_resp_fc_id; // function call ID for current args delta - int oai_resp_seq_num = 0; // monotonically increasing per-stream - int oai_resp_output_idx = 0; // tracks current output item index + std::string oai_resp_fc_id; // model's tool_call ID for current function call + std::string oai_resp_fc_item_id; // our generated fc_ item ID for current function call + std::vector<std::string> oai_resp_fc_item_ids; // all generated fc_ IDs, in order of tool call appearance + int oai_resp_seq_num = 0; // monotonically increasing per-stream + int oai_resp_output_idx = 0; // tracks current output item index task_result_state(const common_chat_parser_params & chat_parser_params) : chat_parser_params(chat_parser_params) @@ -372,6 +374,7 @@ struct server_task_result_cmpl_final : server_task_result { std::string oai_resp_id; std::string oai_resp_reasoning_id; std::string oai_resp_message_id; + std::vector<std::string> oai_resp_fc_item_ids; int oai_resp_seq_num = 0; virtual bool is_stop() override { @@ -387,6 +390,7 @@ struct server_task_result_cmpl_final : server_task_result { oai_resp_id = state.oai_resp_id;
oai_resp_reasoning_id = state.oai_resp_reasoning_id; oai_resp_message_id = state.oai_resp_message_id; + oai_resp_fc_item_ids = state.oai_resp_fc_item_ids; oai_resp_seq_num = state.oai_resp_seq_num; } @@ -440,6 +444,7 @@ struct server_task_result_cmpl_partial : server_task_result { std::string oai_resp_reasoning_id; std::string oai_resp_message_id; std::string oai_resp_fc_id; + std::string oai_resp_fc_item_id; int oai_resp_seq_num = 0; int oai_resp_output_idx = 0; diff --git a/tools/server/tests/unit/test_compat_oai_responses.py b/tools/server/tests/unit/test_compat_oai_responses.py index 2f720f0809..fac6310214 100644 --- a/tools/server/tests/unit/test_compat_oai_responses.py +++ b/tools/server/tests/unit/test_compat_oai_responses.py @@ -136,8 +136,8 @@ def test_responses_stream_schema_fields(): saw_output_item_done = False completed_response = None for data in res: - if "sequence_number" in data: - seen_seq_nums.append(data["sequence_number"]) + assert "sequence_number" in data, f"missing sequence_number in {data.get('type')}" + seen_seq_nums.append(data["sequence_number"]) if data.get("type") == "response.output_text.done": saw_output_text_done = True assert "content_index" in data @@ -435,11 +435,13 @@ def test_responses_stream_created_event_has_full_response(): assert resp["id"].startswith("resp_") assert resp["object"] == "response" assert resp["model"] is not None - assert "metadata" in resp - assert "store" in resp - assert "truncation" in resp - assert "tools" in resp - assert "usage" in resp + assert resp["completed_at"] is None + assert resp["metadata"] == {} + assert resp["store"] == False + assert resp["truncation"] == "disabled" + assert resp["tools"] == [] + assert resp["usage"]["input_tokens"] == 0 + assert resp["usage"]["output_tokens"] == 0 assert resp["output"] == [] assert resp["output_text"] == "" From adef64cb9ffeadbe075dcfe302232d2b6654b1e0 Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Tue, 31 Mar 2026 06:37:49 +0200 Subject: [PATCH 
7/7] server: fix reasoning item content format handling for multi-turn Accept all valid reasoning item content formats in multi-turn input: - Array of objects: [{"type":"reasoning_text","text":"..."}] (spec format) - Plain string: "thinking about it" (OpenCode format) - Null: content:null with encrypted_content (Codex, openai/codex#11834) - Omitted entirely: no content field present Previously threw "item['content'] is not an array" for non-array formats, breaking OpenCode multi-turn conversations. The encrypted_content field is accepted but ignored for local models (no server-side decryption). Add 4 tests covering each format variant. Refs: openai/codex#11834, anomalyco/opencode#19081 --- tools/server/server-common.cpp | 26 +++--- .../tests/unit/test_compat_oai_responses.py | 83 +++++++++++++++++++ 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index ae45f24f74..bde76e8392 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1387,24 +1387,30 @@ json convert_responses_to_chatcmpl(const json & response_body) { item.at("type") == "reasoning") { // #responses_create-input-input_item_list-item-reasoning - if (!exists_and_is_array(item, "content")) { - throw std::invalid_argument("item['content'] is not an array"); - } - if (item.at("content").empty()) { - throw std::invalid_argument("item['content'] is empty"); - } - if (!exists_and_is_string(item.at("content")[0], "text")) { - throw std::invalid_argument("item['content']['text'] is not a string"); + // content can be: null, omitted, a string, or array of {type, text} objects. + // Codex may send content:null or omit it entirely (issue openai/codex#11834). + // OpenCode may send content as a plain string. + // The spec uses array format: [{"type":"reasoning_text","text":"..."}]. + // encrypted_content (opaque string) is accepted but ignored for local models. 
std::string reasoning_text; + if (!item.contains("content") || item.at("content").is_null()) { + // null or missing content — skip (encrypted_content only, or empty reasoning) + } else if (item.at("content").is_string()) { + reasoning_text = item.at("content").get<std::string>(); + } else if (item.at("content").is_array() && !item.at("content").empty() + && exists_and_is_string(item.at("content")[0], "text")) { + reasoning_text = item.at("content")[0].at("text").get<std::string>(); + } + // else: empty array or unrecognized format — treat as empty reasoning if (merge_prev) { auto & prev_msg = chatcmpl_messages.back(); - prev_msg["reasoning_content"] = item.at("content")[0].at("text"); + prev_msg["reasoning_content"] = reasoning_text; } else { chatcmpl_messages.push_back(json { {"role", "assistant"}, {"content", json::array()}, - {"reasoning_content", item.at("content")[0].at("text")}, + {"reasoning_content", reasoning_text}, }); } } else { diff --git a/tools/server/tests/unit/test_compat_oai_responses.py b/tools/server/tests/unit/test_compat_oai_responses.py index fac6310214..0bd2989755 100644 --- a/tools/server/tests/unit/test_compat_oai_responses.py +++ b/tools/server/tests/unit/test_compat_oai_responses.py @@ -509,3 +509,86 @@ def test_responses_stream_delta_events_have_indices(): assert saw_output_item_added, "never received response.output_item.added" assert saw_content_part_added, "never received response.content_part.added" assert saw_output_text_delta, "never received response.output_text.delta" + + +def test_responses_reasoning_content_array(): + """Reasoning items with content as array (spec format) must be accepted.""" + global server + server.start() + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "user", "content": [{"type": "input_text", "text": "Hi"}]}, + {"type": "reasoning", "summary": [], + "content": [{"type": "reasoning_text", "text": "thinking"}]}, + {"role": "assistant", "type": "message", + "content": [{"type":
"output_text", "text": "Hello"}]}, + {"role": "user", "content": [{"type": "input_text", "text": "How are you"}]}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert res.status_code == 200 + assert res.body["status"] == "completed" + + +def test_responses_reasoning_content_string(): + """Reasoning items with content as plain string (OpenCode format) must be accepted.""" + global server + server.start() + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "user", "content": [{"type": "input_text", "text": "Hi"}]}, + {"type": "reasoning", "summary": [], "content": "thinking about it"}, + {"role": "assistant", "type": "message", + "content": [{"type": "output_text", "text": "Hello"}]}, + {"role": "user", "content": [{"type": "input_text", "text": "How are you"}]}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert res.status_code == 200 + assert res.body["status"] == "completed" + + +def test_responses_reasoning_content_null(): + """Reasoning items with content:null (Codex format, issue openai/codex#11834) + must be accepted — content may be null when encrypted_content is present.""" + global server + server.start() + res = server.make_request("POST", "/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "user", "content": [{"type": "input_text", "text": "Hi"}]}, + {"type": "reasoning", "summary": [], "content": None, + "encrypted_content": "opaque_data_here"}, + {"role": "assistant", "type": "message", + "content": [{"type": "output_text", "text": "Hello"}]}, + {"role": "user", "content": [{"type": "input_text", "text": "How are you"}]}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert res.status_code == 200 + assert res.body["status"] == "completed" + + +def test_responses_reasoning_content_omitted(): + """Reasoning items with content omitted entirely must be accepted.""" + global server + server.start() + res = server.make_request("POST", 
"/v1/responses", data={ + "model": "gpt-4.1", + "input": [ + {"role": "user", "content": [{"type": "input_text", "text": "Hi"}]}, + {"type": "reasoning", "summary": []}, + {"role": "assistant", "type": "message", + "content": [{"type": "output_text", "text": "Hello"}]}, + {"role": "user", "content": [{"type": "input_text", "text": "How are you"}]}, + ], + "max_output_tokens": 8, + "temperature": 0.8, + }) + assert res.status_code == 200 + assert res.body["status"] == "completed"