server: improve Responses API compliance and Codex CLI compatibility
Codex CLI compatibility:
- Skip non-function tool types (web_search, code_interpreter)
- Merge developer/system messages into position 0 for Qwen templates
- Strip Responses-only request keys (store, include, prompt_cache_key)
- Add an output_text convenience field to streaming and non-streaming responses

Responses API compliance (ideas from #19720 by riskywindow, adapted):
- Add 24 missing Response object fields per OpenAI spec
- Fix function_call id/call_id field mapping
- Add sequence_number, output_index, content_index to streaming events
- Accept input_text type and EasyInputMessage for multi-turn input

Verified: codex -p local and codex -p fast work against local llama.cpp with Qwen3.5 models, including native tool calling.

Refs: ggml-org/llama.cpp#19138, ggml-org/llama.cpp#19720
This commit is contained in:
parent
7c203670f8
commit
302c3c8f61
|
|
@ -1255,6 +1255,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
|
|||
if (item.contains("status")) {
|
||||
item.erase("status");
|
||||
}
|
||||
// Merge system/developer messages into the first system message.
|
||||
// Many model templates (e.g. Qwen) require all system content at
|
||||
// position 0 and reject system messages elsewhere in the conversation.
|
||||
if (item.at("role") == "system" || item.at("role") == "developer") {
|
||||
if (!chatcmpl_messages.empty() && chatcmpl_messages[0].value("role", "") == "system") {
|
||||
auto & first_msg = chatcmpl_messages[0];
|
||||
// Convert string content to array format if needed
|
||||
if (first_msg["content"].is_string()) {
|
||||
std::string old_text = first_msg["content"].get<std::string>();
|
||||
first_msg["content"] = json::array({json{{"text", old_text}, {"type", "text"}}});
|
||||
}
|
||||
auto & first_content = first_msg["content"];
|
||||
for (const auto & part : chatcmpl_content) {
|
||||
first_content.push_back(part);
|
||||
}
|
||||
continue; // merged, don't push a separate message
|
||||
}
|
||||
item["role"] = "system";
|
||||
}
|
||||
item["content"] = chatcmpl_content;
|
||||
|
||||
chatcmpl_messages.push_back(item);
|
||||
|
|
@ -1266,35 +1285,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
|
|||
// item.at("status") == "completed" ||
|
||||
// item.at("status") == "incomplete") &&
|
||||
// item["status"] not sent by codex-cli
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "message"
|
||||
// item["type"] == "message" for OutputMessage, absent for EasyInputMessage
|
||||
(!item.contains("type") || item.at("type") == "message")
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-output_message
|
||||
auto chatcmpl_content = json::array();
|
||||
// Also handles AssistantMessageItemParam / EasyInputMessage with role "assistant"
|
||||
std::vector<json> chatcmpl_content;
|
||||
|
||||
for (const auto & output_text : item.at("content")) {
|
||||
const std::string type = json_value(output_text, "type", std::string());
|
||||
if (type == "output_text") {
|
||||
if (!exists_and_is_string(output_text, "text")) {
|
||||
throw std::invalid_argument("'Output text' requires 'text'");
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"text", output_text.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
}
|
||||
} else if (type == "refusal") {
|
||||
if (!exists_and_is_string(output_text, "refusal")) {
|
||||
throw std::invalid_argument("'Refusal' requires 'refusal'");
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"refusal", output_text.at("refusal")},
|
||||
{"type", "refusal"},
|
||||
});
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
|
||||
if (type != "output_text" && type != "input_text") {
|
||||
throw std::invalid_argument("'type' must be 'output_text' or 'input_text'");
|
||||
}
|
||||
if (!exists_and_is_string(output_text, "text")) {
|
||||
throw std::invalid_argument("'Output text' requires 'text'");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"text", output_text.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
}
|
||||
|
||||
if (merge_prev) {
|
||||
|
|
@ -1303,7 +1312,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
|
|||
prev_msg["content"] = json::array();
|
||||
}
|
||||
auto & prev_content = prev_msg["content"];
|
||||
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
|
||||
for (const auto & part : chatcmpl_content) {
|
||||
prev_content.push_back(part);
|
||||
}
|
||||
} else {
|
||||
item.erase("status");
|
||||
item.erase("type");
|
||||
|
|
@ -1407,11 +1418,17 @@ json convert_responses_to_chatcmpl(const json & response_body) {
|
|||
}
|
||||
std::vector<json> chatcmpl_tools;
|
||||
for (json resp_tool : response_body.at("tools")) {
|
||||
json chatcmpl_tool;
|
||||
const std::string tool_type = json_value(resp_tool, "type", std::string());
|
||||
|
||||
if (json_value(resp_tool, "type", std::string()) != "function") {
|
||||
throw std::invalid_argument("'type' of tool must be 'function'");
|
||||
// Skip non-function tools (e.g. web_search, code_interpreter)
|
||||
// sent by clients like Codex CLI — these are provider-specific
|
||||
// and cannot be converted to chat completions function tools
|
||||
if (tool_type != "function") {
|
||||
SRV_WRN("skipping unsupported tool type '%s' in Responses conversion\n", tool_type.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
json chatcmpl_tool;
|
||||
resp_tool.erase("type");
|
||||
chatcmpl_tool["type"] = "function";
|
||||
|
||||
|
|
@ -1422,7 +1439,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
|
|||
chatcmpl_tools.push_back(chatcmpl_tool);
|
||||
}
|
||||
chatcmpl_body.erase("tools");
|
||||
chatcmpl_body["tools"] = chatcmpl_tools;
|
||||
if (!chatcmpl_tools.empty()) {
|
||||
chatcmpl_body["tools"] = chatcmpl_tools;
|
||||
}
|
||||
}
|
||||
|
||||
if (response_body.contains("max_output_tokens")) {
|
||||
|
|
@ -1430,6 +1449,15 @@ json convert_responses_to_chatcmpl(const json & response_body) {
|
|||
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
|
||||
}
|
||||
|
||||
// Strip Responses-only keys that have no chat completions equivalent
|
||||
// (e.g. Codex CLI sends store, include, prompt_cache_key, web_search)
|
||||
for (const char * key : {
|
||||
"store", "include", "prompt_cache_key", "web_search",
|
||||
"text", "truncation", "metadata",
|
||||
}) {
|
||||
chatcmpl_body.erase(key);
|
||||
}
|
||||
|
||||
return chatcmpl_body;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -960,28 +960,66 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
|
|||
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
|
||||
output.push_back(json {
|
||||
{"type", "function_call"},
|
||||
{"status", "completed"},
|
||||
{"arguments", tool_call.arguments},
|
||||
{"call_id", "fc_" + tool_call.id},
|
||||
{"id", "fc_" + random_string()},
|
||||
{"call_id", tool_call.id},
|
||||
{"name", tool_call.name},
|
||||
{"arguments", tool_call.arguments},
|
||||
{"status", "completed"},
|
||||
});
|
||||
}
|
||||
|
||||
// Build output_text convenience field (concatenation of all output_text parts)
|
||||
std::string output_text;
|
||||
for (const auto & item : output) {
|
||||
if (json_value(item, "type", std::string()) == "message") {
|
||||
for (const auto & part : item.at("content")) {
|
||||
if (json_value(part, "type", std::string()) == "output_text") {
|
||||
output_text += part.at("text").get<std::string>();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
json res = {
|
||||
{"completed_at", t},
|
||||
{"created_at", t},
|
||||
{"id", oai_resp_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "response"},
|
||||
{"output", output},
|
||||
{"status", "completed"},
|
||||
{"usage", json {
|
||||
{"input_tokens", n_prompt_tokens},
|
||||
{"output_tokens", n_decoded},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
{"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
|
||||
{"completed_at", t},
|
||||
{"created_at", t},
|
||||
{"id", oai_resp_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "response"},
|
||||
{"output", output},
|
||||
{"output_text", output_text},
|
||||
{"status", "completed"},
|
||||
{"usage", json {
|
||||
{"input_tokens", n_prompt_tokens},
|
||||
{"output_tokens", n_decoded},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
|
||||
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
|
||||
}},
|
||||
{"incomplete_details", nullptr},
|
||||
{"previous_response_id", nullptr},
|
||||
{"instructions", nullptr},
|
||||
{"error", nullptr},
|
||||
{"tools", json::array()},
|
||||
{"tool_choice", "auto"},
|
||||
{"truncation", "disabled"},
|
||||
{"parallel_tool_calls", false},
|
||||
{"text", json{{"format", json{{"type", "text"}}}}},
|
||||
{"top_p", 1.0},
|
||||
{"presence_penalty", 0.0},
|
||||
{"frequency_penalty", 0.0},
|
||||
{"top_logprobs", 0},
|
||||
{"temperature", 1.0},
|
||||
{"reasoning", nullptr},
|
||||
{"max_output_tokens", nullptr},
|
||||
{"max_tool_calls", nullptr},
|
||||
{"store", false},
|
||||
{"background", false},
|
||||
{"service_tier", "default"},
|
||||
{"safety_identifier", nullptr},
|
||||
{"prompt_cache_key", nullptr},
|
||||
{"metadata", json::object()},
|
||||
};
|
||||
|
||||
return res;
|
||||
|
|
@ -990,6 +1028,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
|
|||
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
||||
std::vector<json> server_sent_events;
|
||||
std::vector<json> output;
|
||||
int & seq_num = oai_resp_seq_num;
|
||||
|
||||
if (oaicompat_msg.reasoning_content != "") {
|
||||
const json output_item = json {
|
||||
|
|
@ -1006,8 +1045,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
|||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_item.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.done"},
|
||||
{"item", output_item}
|
||||
{"type", "response.output_item.done"},
|
||||
{"sequence_number", seq_num++},
|
||||
{"output_index", 0},
|
||||
{"item", output_item},
|
||||
}}
|
||||
});
|
||||
output.push_back(output_item);
|
||||
|
|
@ -1017,9 +1058,13 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
|||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_text.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_text.done"},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"text", oaicompat_msg.content}
|
||||
{"type", "response.output_text.done"},
|
||||
{"sequence_number", seq_num++},
|
||||
{"output_index", 0},
|
||||
{"content_index", 0},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"text", oaicompat_msg.content},
|
||||
{"logprobs", json::array()},
|
||||
}}
|
||||
});
|
||||
|
||||
|
|
@ -1033,9 +1078,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
|||
server_sent_events.push_back(json {
|
||||
{"event", "response.content_part.done"},
|
||||
{"data", json {
|
||||
{"type", "response.content_part.done"},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"part", content_part}
|
||||
{"type", "response.content_part.done"},
|
||||
{"sequence_number", seq_num++},
|
||||
{"output_index", 0},
|
||||
{"content_index", 0},
|
||||
{"item_id", oai_resp_message_id},
|
||||
{"part", content_part},
|
||||
}}
|
||||
});
|
||||
const json output_item = {
|
||||
|
|
@ -1049,8 +1097,10 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
|||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_item.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.done"},
|
||||
{"item", output_item}
|
||||
{"type", "response.output_item.done"},
|
||||
{"sequence_number", seq_num++},
|
||||
{"output_index", 0},
|
||||
{"item", output_item},
|
||||
}}
|
||||
});
|
||||
output.push_back(output_item);
|
||||
|
|
@ -1059,39 +1109,81 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
|||
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
|
||||
const json output_item = {
|
||||
{"type", "function_call"},
|
||||
{"status", "completed"},
|
||||
{"id", "fc_" + random_string()},
|
||||
{"call_id", tool_call.id},
|
||||
{"name", tool_call.name},
|
||||
{"arguments", tool_call.arguments},
|
||||
{"call_id", "fc_" + tool_call.id},
|
||||
{"name", tool_call.name}
|
||||
{"status", "completed"},
|
||||
};
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.output_item.done"},
|
||||
{"data", json {
|
||||
{"type", "response.output_item.done"},
|
||||
{"item", output_item}
|
||||
{"type", "response.output_item.done"},
|
||||
{"sequence_number", seq_num++},
|
||||
{"output_index", 0},
|
||||
{"item", output_item},
|
||||
}}
|
||||
});
|
||||
output.push_back(output_item);
|
||||
}
|
||||
|
||||
// Build output_text convenience field for streaming final event
|
||||
std::string output_text_stream;
|
||||
for (const auto & item : output) {
|
||||
if (json_value(item, "type", std::string()) == "message") {
|
||||
for (const auto & part : item.at("content")) {
|
||||
if (json_value(part, "type", std::string()) == "output_text") {
|
||||
output_text_stream += part.at("text").get<std::string>();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
server_sent_events.push_back(json {
|
||||
{"event", "response.completed"},
|
||||
{"data", json {
|
||||
{"type", "response.completed"},
|
||||
{"type", "response.completed"},
|
||||
{"sequence_number", seq_num++},
|
||||
{"response", json {
|
||||
{"id", oai_resp_id},
|
||||
{"object", "response"},
|
||||
{"created_at", t},
|
||||
{"status", "completed"},
|
||||
{"model", oaicompat_model},
|
||||
{"output", output},
|
||||
{"usage", json {
|
||||
{"input_tokens", n_prompt_tokens},
|
||||
{"output_tokens", n_decoded},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
{"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
|
||||
}}
|
||||
{"completed_at", t},
|
||||
{"created_at", t},
|
||||
{"id", oai_resp_id},
|
||||
{"object", "response"},
|
||||
{"status", "completed"},
|
||||
{"model", oaicompat_model},
|
||||
{"output", output},
|
||||
{"output_text", output_text_stream},
|
||||
{"usage", json {
|
||||
{"input_tokens", n_prompt_tokens},
|
||||
{"output_tokens", n_decoded},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
|
||||
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
|
||||
}},
|
||||
{"incomplete_details", nullptr},
|
||||
{"previous_response_id", nullptr},
|
||||
{"instructions", nullptr},
|
||||
{"error", nullptr},
|
||||
{"tools", json::array()},
|
||||
{"tool_choice", "auto"},
|
||||
{"truncation", "disabled"},
|
||||
{"parallel_tool_calls", false},
|
||||
{"text", json{{"format", json{{"type", "text"}}}}},
|
||||
{"top_p", 1.0},
|
||||
{"presence_penalty", 0.0},
|
||||
{"frequency_penalty", 0.0},
|
||||
{"top_logprobs", 0},
|
||||
{"temperature", 1.0},
|
||||
{"reasoning", nullptr},
|
||||
{"max_output_tokens", nullptr},
|
||||
{"max_tool_calls", nullptr},
|
||||
{"store", false},
|
||||
{"background", false},
|
||||
{"service_tier", "default"},
|
||||
{"safety_identifier", nullptr},
|
||||
{"prompt_cache_key", nullptr},
|
||||
{"metadata", json::object()},
|
||||
}},
|
||||
}}
|
||||
});
|
||||
|
|
|
|||
|
|
@ -370,6 +370,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||
std::string oai_resp_id;
|
||||
std::string oai_resp_reasoning_id;
|
||||
std::string oai_resp_message_id;
|
||||
int oai_resp_seq_num = 0;
|
||||
|
||||
virtual bool is_stop() override {
|
||||
return true; // in stream mode, final responses are considered stop
|
||||
|
|
|
|||
Loading…
Reference in New Issue