server: improve Responses API compliance and Codex CLI compatibility

Codex CLI compatibility:
- Skip non-function tool types (web_search, code_interpreter)
- Merge developer/system messages into position 0 for Qwen templates
- Strip Responses-only request keys (store, include, prompt_cache_key)
- Add output_text convenience field in streaming and non-streaming responses

Responses API compliance (ideas from #19720 by riskywindow, adapted):
- Add 24 missing Response object fields per OpenAI spec
- Fix function_call id/call_id field mapping
- Add sequence_number, output_index, content_index to streaming events
- Accept input_text type and EasyInputMessage for multi-turn input

Verified: codex -p local and codex -p fast work against local
llama.cpp with Qwen3.5 models including native tool calling.

Refs: ggml-org/llama.cpp#19138, ggml-org/llama.cpp#19720
This commit is contained in:
Christopher Albert 2026-03-30 09:39:59 +02:00
parent 7c203670f8
commit 302c3c8f61
3 changed files with 192 additions and 71 deletions

View File

@ -1255,6 +1255,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
if (item.contains("status")) {
item.erase("status");
}
// Merge system/developer messages into the first system message.
// Many model templates (e.g. Qwen) require all system content at
// position 0 and reject system messages elsewhere in the conversation.
if (item.at("role") == "system" || item.at("role") == "developer") {
if (!chatcmpl_messages.empty() && chatcmpl_messages[0].value("role", "") == "system") {
auto & first_msg = chatcmpl_messages[0];
// Convert string content to array format if needed
if (first_msg["content"].is_string()) {
std::string old_text = first_msg["content"].get<std::string>();
first_msg["content"] = json::array({json{{"text", old_text}, {"type", "text"}}});
}
auto & first_content = first_msg["content"];
for (const auto & part : chatcmpl_content) {
first_content.push_back(part);
}
continue; // merged, don't push a separate message
}
item["role"] = "system";
}
item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item);
@ -1266,35 +1285,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
// item.at("status") == "completed" ||
// item.at("status") == "incomplete") &&
// item["status"] not sent by codex-cli
exists_and_is_string(item, "type") &&
item.at("type") == "message"
// item["type"] == "message" for OutputMessage, absent for EasyInputMessage
(!item.contains("type") || item.at("type") == "message")
) {
// #responses_create-input-input_item_list-item-output_message
auto chatcmpl_content = json::array();
// Also handles AssistantMessageItemParam / EasyInputMessage with role "assistant"
std::vector<json> chatcmpl_content;
for (const auto & output_text : item.at("content")) {
const std::string type = json_value(output_text, "type", std::string());
if (type == "output_text") {
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
}
} else if (type == "refusal") {
if (!exists_and_is_string(output_text, "refusal")) {
throw std::invalid_argument("'Refusal' requires 'refusal'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"refusal", output_text.at("refusal")},
{"type", "refusal"},
});
}
} else {
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
if (type != "output_text" && type != "input_text") {
throw std::invalid_argument("'type' must be 'output_text' or 'input_text'");
}
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
}
if (merge_prev) {
@ -1303,7 +1312,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
prev_msg["content"] = json::array();
}
auto & prev_content = prev_msg["content"];
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
for (const auto & part : chatcmpl_content) {
prev_content.push_back(part);
}
} else {
item.erase("status");
item.erase("type");
@ -1407,11 +1418,17 @@ json convert_responses_to_chatcmpl(const json & response_body) {
}
std::vector<json> chatcmpl_tools;
for (json resp_tool : response_body.at("tools")) {
json chatcmpl_tool;
const std::string tool_type = json_value(resp_tool, "type", std::string());
if (json_value(resp_tool, "type", std::string()) != "function") {
throw std::invalid_argument("'type' of tool must be 'function'");
// Skip non-function tools (e.g. web_search, code_interpreter)
// sent by clients like Codex CLI — these are provider-specific
// and cannot be converted to chat completions function tools
if (tool_type != "function") {
SRV_WRN("skipping unsupported tool type '%s' in Responses conversion\n", tool_type.c_str());
continue;
}
json chatcmpl_tool;
resp_tool.erase("type");
chatcmpl_tool["type"] = "function";
@ -1422,7 +1439,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
chatcmpl_tools.push_back(chatcmpl_tool);
}
chatcmpl_body.erase("tools");
chatcmpl_body["tools"] = chatcmpl_tools;
if (!chatcmpl_tools.empty()) {
chatcmpl_body["tools"] = chatcmpl_tools;
}
}
if (response_body.contains("max_output_tokens")) {
@ -1430,6 +1449,15 @@ json convert_responses_to_chatcmpl(const json & response_body) {
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
}
// Strip Responses-only keys that have no chat completions equivalent
// (e.g. Codex CLI sends store, include, prompt_cache_key, web_search)
for (const char * key : {
"store", "include", "prompt_cache_key", "web_search",
"text", "truncation", "metadata",
}) {
chatcmpl_body.erase(key);
}
return chatcmpl_body;
}

View File

@ -960,28 +960,66 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
output.push_back(json {
{"type", "function_call"},
{"status", "completed"},
{"arguments", tool_call.arguments},
{"call_id", "fc_" + tool_call.id},
{"id", "fc_" + random_string()},
{"call_id", tool_call.id},
{"name", tool_call.name},
{"arguments", tool_call.arguments},
{"status", "completed"},
});
}
// Build output_text convenience field (concatenation of all output_text parts)
std::string output_text;
for (const auto & item : output) {
if (json_value(item, "type", std::string()) == "message") {
for (const auto & part : item.at("content")) {
if (json_value(part, "type", std::string()) == "output_text") {
output_text += part.at("text").get<std::string>();
}
}
}
}
std::time_t t = std::time(0);
json res = {
{"completed_at", t},
{"created_at", t},
{"id", oai_resp_id},
{"model", oaicompat_model},
{"object", "response"},
{"output", output},
{"status", "completed"},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
{"completed_at", t},
{"created_at", t},
{"id", oai_resp_id},
{"model", oaicompat_model},
{"object", "response"},
{"output", output},
{"output_text", output_text},
{"status", "completed"},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
}},
{"incomplete_details", nullptr},
{"previous_response_id", nullptr},
{"instructions", nullptr},
{"error", nullptr},
{"tools", json::array()},
{"tool_choice", "auto"},
{"truncation", "disabled"},
{"parallel_tool_calls", false},
{"text", json{{"format", json{{"type", "text"}}}}},
{"top_p", 1.0},
{"presence_penalty", 0.0},
{"frequency_penalty", 0.0},
{"top_logprobs", 0},
{"temperature", 1.0},
{"reasoning", nullptr},
{"max_output_tokens", nullptr},
{"max_tool_calls", nullptr},
{"store", false},
{"background", false},
{"service_tier", "default"},
{"safety_identifier", nullptr},
{"prompt_cache_key", nullptr},
{"metadata", json::object()},
};
return res;
@ -990,6 +1028,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
std::vector<json> server_sent_events;
std::vector<json> output;
int & seq_num = oai_resp_seq_num;
if (oaicompat_msg.reasoning_content != "") {
const json output_item = json {
@ -1006,8 +1045,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json {
{"event", "response.output_item.done"},
{"data", json {
{"type", "response.output_item.done"},
{"item", output_item}
{"type", "response.output_item.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"item", output_item},
}}
});
output.push_back(output_item);
@ -1017,9 +1058,13 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json {
{"event", "response.output_text.done"},
{"data", json {
{"type", "response.output_text.done"},
{"item_id", oai_resp_message_id},
{"text", oaicompat_msg.content}
{"type", "response.output_text.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"content_index", 0},
{"item_id", oai_resp_message_id},
{"text", oaicompat_msg.content},
{"logprobs", json::array()},
}}
});
@ -1033,9 +1078,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json {
{"event", "response.content_part.done"},
{"data", json {
{"type", "response.content_part.done"},
{"item_id", oai_resp_message_id},
{"part", content_part}
{"type", "response.content_part.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"content_index", 0},
{"item_id", oai_resp_message_id},
{"part", content_part},
}}
});
const json output_item = {
@ -1049,8 +1097,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
server_sent_events.push_back(json {
{"event", "response.output_item.done"},
{"data", json {
{"type", "response.output_item.done"},
{"item", output_item}
{"type", "response.output_item.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"item", output_item},
}}
});
output.push_back(output_item);
@ -1059,39 +1109,81 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
const json output_item = {
{"type", "function_call"},
{"status", "completed"},
{"id", "fc_" + random_string()},
{"call_id", tool_call.id},
{"name", tool_call.name},
{"arguments", tool_call.arguments},
{"call_id", "fc_" + tool_call.id},
{"name", tool_call.name}
{"status", "completed"},
};
server_sent_events.push_back(json {
{"event", "response.output_item.done"},
{"data", json {
{"type", "response.output_item.done"},
{"item", output_item}
{"type", "response.output_item.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"item", output_item},
}}
});
output.push_back(output_item);
}
// Build output_text convenience field for streaming final event
std::string output_text_stream;
for (const auto & item : output) {
if (json_value(item, "type", std::string()) == "message") {
for (const auto & part : item.at("content")) {
if (json_value(part, "type", std::string()) == "output_text") {
output_text_stream += part.at("text").get<std::string>();
}
}
}
}
std::time_t t = std::time(0);
server_sent_events.push_back(json {
{"event", "response.completed"},
{"data", json {
{"type", "response.completed"},
{"type", "response.completed"},
{"sequence_number", seq_num++},
{"response", json {
{"id", oai_resp_id},
{"object", "response"},
{"created_at", t},
{"status", "completed"},
{"model", oaicompat_model},
{"output", output},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
}}
{"completed_at", t},
{"created_at", t},
{"id", oai_resp_id},
{"object", "response"},
{"status", "completed"},
{"model", oaicompat_model},
{"output", output},
{"output_text", output_text_stream},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
}},
{"incomplete_details", nullptr},
{"previous_response_id", nullptr},
{"instructions", nullptr},
{"error", nullptr},
{"tools", json::array()},
{"tool_choice", "auto"},
{"truncation", "disabled"},
{"parallel_tool_calls", false},
{"text", json{{"format", json{{"type", "text"}}}}},
{"top_p", 1.0},
{"presence_penalty", 0.0},
{"frequency_penalty", 0.0},
{"top_logprobs", 0},
{"temperature", 1.0},
{"reasoning", nullptr},
{"max_output_tokens", nullptr},
{"max_tool_calls", nullptr},
{"store", false},
{"background", false},
{"service_tier", "default"},
{"safety_identifier", nullptr},
{"prompt_cache_key", nullptr},
{"metadata", json::object()},
}},
}}
});

View File

@ -370,6 +370,7 @@ struct server_task_result_cmpl_final : server_task_result {
std::string oai_resp_id;
std::string oai_resp_reasoning_id;
std::string oai_resp_message_id;
int oai_resp_seq_num = 0;
virtual bool is_stop() override {
return true; // in stream mode, final responses are considered stop