From fbbf3ad1900bbaa97cd3c8de4c764afb0f6d8972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=ED=9D=AC=EC=A4=80?= Date: Thu, 22 Jan 2026 01:47:23 +0900 Subject: [PATCH] server: /v1/responses (partial) (#18486) * from previous PR * Make instruction(system) as first message * Convert [input_message] (text/image/file) * Rename convert_responses_to_chatcmpl(body) -> response_body * Initial tool call support * Erase instructions field from chatcmpl body * Feed reasoning texts to chat template * Use std::vector instead of opaque json array * Make output_item.added events consistent * Move `server_task_result_cmpl_partial::update` from header to source * Match ID of output_item.added and .done events * Add function_call only if there is no "fc_" prefix * Add function call output at non-streaming API * Test if ID is persistent * Add doc * Fix style - use trailing comma * Rewrite state management * catch up with upstream/master * Fix style - "type" is the first item of SSE data * Explicitly check "instructions" from response_body * Make lambdas static * Check if reasoning content exists * Add `oai_resp_id` to task_result_state(also initialized at ctor), server_task_result_cmpl_partial, and server_task_result_cmpl_final * Reject `input_file` since it is not supported by chatcmpl * Add "fc_" prefix to non-straming function call id as coderabbit pointed out --------- Co-authored-by: openingnow <> --- requirements/requirements-tool_bench.txt | 2 +- tools/server/README.md | 45 ++- tools/server/server-common.cpp | 295 +++++++++++++++ tools/server/server-common.h | 5 + tools/server/server-context.cpp | 37 +- tools/server/server-context.h | 1 + tools/server/server-task.cpp | 344 +++++++++++++++++- tools/server/server-task.h | 70 ++-- tools/server/server.cpp | 2 + tools/server/tests/requirements.txt | 2 +- .../tests/unit/test_compat_oai_responses.py | 73 ++++ 11 files changed, 836 insertions(+), 40 deletions(-) create mode 100644 tools/server/tests/unit/test_compat_oai_responses.py diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt index f7912aff72..3bb74fb9d0 100644 --- a/requirements/requirements-tool_bench.txt +++ b/requirements/requirements-tool_bench.txt @@ -3,7 +3,7 @@ pytest~=8.3.3 huggingface_hub>=0.34.0,<1.0 matplotlib~=3.10.0 numpy~=1.26.4 -openai~=1.55.3 +openai~=2.14.0 pandas~=2.2.3 prometheus-client~=0.20.0 requests~=2.32.3 diff --git a/tools/server/README.md b/tools/server/README.md index 9fe8938768..191391a882 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -6,7 +6,7 @@ Set of LLM REST APIs and a web UI to interact with llama.cpp. **Features:** * LLM inference of F16 and quantized models on GPU and CPU - * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes + * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes * [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) compatible chat completions * Reranking endpoint (https://github.com/ggml-org/llama.cpp/pull/9510) * Parallel decoding with multi-user support @@ -1267,6 +1267,49 @@ This provides information on the performance of the server. It also allows calcu The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n` +### POST `/v1/responses`: OpenAI-compatible Responses API + +*Options:* + +See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses). 
+
+*Examples:*
+
+You can use either the Python `openai` library:
+
+```python
+import openai
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+    api_key = "sk-no-key-required"
+)
+
+response = client.responses.create(
+    model="gpt-4.1",
+    instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+    input="Write a limerick about python exceptions"
+)
+
+print(response.output_text)
+```
+
+... or raw HTTP requests:
+
+```shell
+curl http://localhost:8080/v1/responses \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer no-key" \
+-d '{
+"model": "gpt-4.1",
+"instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+"input": "Write a limerick about python exceptions"
+}'
+```
+
+This endpoint works by converting the Responses API request into a Chat Completions request.
+
+
 ### POST `/v1/embeddings`: OpenAI-compatible embeddings API
 
 This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 4aeeda2ffe..a853f65c8d 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1069,6 +1069,283 @@ json oaicompat_chat_params_parse(
     return llama_params;
 }
 
+json convert_responses_to_chatcmpl(const json & response_body) {
+    if (!response_body.contains("input")) {
+        throw std::invalid_argument("'input' is required");
+    }
+    if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
+        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
+    }
+
+    const json input_value = response_body.at("input");
+    json chatcmpl_body = response_body;
+    chatcmpl_body.erase("input");
+    std::vector<json> chatcmpl_messages;
+
+    if (response_body.contains("instructions")) {
+        chatcmpl_messages.push_back({
+            {"role", "system"},
+            {"content", json_value(response_body, "instructions", std::string())},
+        });
+        chatcmpl_body.erase("instructions");
+    }
+
+    if (input_value.is_string()) {
+        // #responses_create-input-text_input
+        chatcmpl_messages.push_back({
+            {"role", "user"},
+            {"content", input_value},
+        });
+    } else if (input_value.is_array()) {
+        // #responses_create-input-input_item_list
+
+        static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
+            return j.contains(key) && j.at(key).is_array();
+        };
+        static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
+            return j.contains(key) && j.at(key).is_string();
+        };
+
+        for (json item : input_value) {
+            if (exists_and_is_string(item, "content")) {
+                // #responses_create-input-input_item_list-input_message-content-text_input
+                // Only "Input message" contains item["content"]::string
+                // After converting item["content"]::string to item["content"]::array,
+                // we can treat "Input message" as sum of "Item-Input message" and "Item-Output message"
+                item["content"] = json::array({
+                    json {
+                        {"text", item.at("content")},
+                        {"type", "input_text"}
+                    }
+                });
+            }
+
+            if (exists_and_is_array(item, "content") &&
+                exists_and_is_string(item, "role") &&
+                (item.at("role") == "user" ||
+                 item.at("role") == "system" ||
+                 item.at("role") == "developer")
+            ) {
+                // #responses_create-input-input_item_list-item-input_message
+                std::vector<json> chatcmpl_content;
+
+                for (const json & input_item :
item.at("content")) { + const std::string type = json_value(input_item, "type", std::string()); + + if (type == "input_text") { + if (!input_item.contains("text")) { + throw std::invalid_argument("'Input text' requires 'text'"); + } + chatcmpl_content.push_back({ + {"text", input_item.at("text")}, + {"type", "text"}, + }); + } else if (type == "input_image") { + // While `detail` is marked as required, + // it has default value("auto") and can be omitted. + + if (!input_item.contains("image_url")) { + throw std::invalid_argument("'image_url' is required"); + } + chatcmpl_content.push_back({ + {"image_url", json { + {"url", input_item.at("image_url")} + }}, + {"type", "image_url"}, + }); + } else if (type == "input_file") { + throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment"); + // if (input_item.contains("file_url")) { + // // chat completion API does not support file_url + // throw std::invalid_argument("'file_url' is not supported"); + // } + // if (!input_item.contains("file_data") || !input_item.contains("filename")) { + // throw std::invalid_argument("Both 'file_data' and 'filename' are required"); + // } + // chatcmpl_content.push_back({ + // {"file", json { + // {"file_data", input_item.at("file_data")}, + // {"filename", input_item.at("filename")}, + // }}, + // {"type", "file"}, + // }); + } else { + throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'"); + } + } + + if (item.contains("type")) { + item.erase("type"); + } + if (item.contains("status")) { + item.erase("status"); + } + item["content"] = chatcmpl_content; + + chatcmpl_messages.push_back(item); + } else if (exists_and_is_array(item, "content") && + exists_and_is_string(item, "role") && + item.at("role") == "assistant" && + // exists_and_is_string(item, "status") && + // (item.at("status") == "in_progress" || + // item.at("status") == "completed" || + // item.at("status") == "incomplete") && + // item["status"] not sent by codex-cli + exists_and_is_string(item, "type") && + item.at("type") == "message" + ) { + // #responses_create-input-input_item_list-item-output_message + std::vector chatcmpl_content; + + for (const auto & output_text : item.at("content")) { + const std::string type = json_value(output_text, "type", std::string()); + if (type != "output_text") { + throw std::invalid_argument("'type' must be 'output_text'"); + } + if (!exists_and_is_string(output_text, "text")) { + throw std::invalid_argument("'Output text' requires 'text'"); + } + // Ignore annotations and logprobs for now + chatcmpl_content.push_back({ + {"text", output_text.at("text")}, + {"type", "text"}, + }); + } + + item.erase("status"); + item.erase("type"); + item["content"] = chatcmpl_content; + chatcmpl_messages.push_back(item); + } else if (exists_and_is_string(item, "arguments") && + exists_and_is_string(item, "call_id") && + exists_and_is_string(item, "name") && + exists_and_is_string(item, "type") && + item.at("type") == "function_call" + ) { + // #responses_create-input-input_item_list-item-function_tool_call + json msg = json { + {"role", "assistant"}, + {"tool_calls", json::array({ json { + {"function", json { + {"arguments", item.at("arguments")}, + {"name", item.at("name")}, + }}, + {"id", item.at("call_id")}, + {"type", "function"}, + }})}, + }; + + if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) { + // Move reasoning content from dummy message to tool call message + msg["reasoning_content"] = 
chatcmpl_messages.back().at("reasoning_content"); + chatcmpl_messages.pop_back(); + } + chatcmpl_messages.push_back(msg); + } else if (exists_and_is_string(item, "call_id") && + (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) && + exists_and_is_string(item, "type") && + item.at("type") == "function_call_output" + ) { + // #responses_create-input-input_item_list-item-function_tool_call_output + if (item.at("output").is_string()) { + chatcmpl_messages.push_back(json { + {"content", item.at("output")}, + {"role", "tool"}, + {"tool_call_id", item.at("call_id")}, + }); + } else { + json chatcmpl_outputs = item.at("output"); + for (json & chatcmpl_output : chatcmpl_outputs) { + if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") { + throw std::invalid_argument("Output of tool call should be 'Input text'"); + } + chatcmpl_output["type"] = "text"; + } + chatcmpl_messages.push_back(json { + {"content", chatcmpl_outputs}, + {"role", "tool"}, + {"tool_call_id", item.at("call_id")}, + }); + } + } else if (// exists_and_is_string(item, "id") && + // item["id"] not sent by codex-cli + exists_and_is_array(item, "summary") && + exists_and_is_string(item, "type") && + item.at("type") == "reasoning") { + // #responses_create-input-input_item_list-item-reasoning + + if (!exists_and_is_array(item, "content")) { + throw std::invalid_argument("item['content'] is not an array"); + } + if (item.at("content").empty()) { + throw std::invalid_argument("item['content'] is empty"); + } + if (!exists_and_is_string(item.at("content")[0], "text")) { + throw std::invalid_argument("item['content']['text'] is not a string"); + } + + // Pack reasoning content in dummy message + chatcmpl_messages.push_back(json { + {"role", "assistant"}, + {"content", json::array()}, + {"reasoning_content", item.at("content")[0].at("text")}, + }); + } else { + throw std::invalid_argument("Cannot determine type of 'item'"); + } + } + } else { + throw std::invalid_argument("'input' must be a string or array of objects"); + } + + // Remove unused dummy message which contains + // reasoning content not followed by tool call + chatcmpl_messages.erase(std::remove_if( + chatcmpl_messages.begin(), + chatcmpl_messages.end(), + [](const json & x){ return x.contains("role") && + x.at("role") == "assistant" && + x.contains("content") && + x.at("content") == json::array() && + x.contains("reasoning_content"); + }), + chatcmpl_messages.end() + ); + + chatcmpl_body["messages"] = chatcmpl_messages; + + if (response_body.contains("tools")) { + if (!response_body.at("tools").is_array()) { + throw std::invalid_argument("'tools' must be an array of objects"); + } + std::vector chatcmpl_tools; + for (json resp_tool : response_body.at("tools")) { + json chatcmpl_tool; + + if (json_value(resp_tool, "type", std::string()) != "function") { + throw std::invalid_argument("'type' of tool must be 'function'"); + } + resp_tool.erase("type"); + chatcmpl_tool["type"] = "function"; + + if (!resp_tool.contains("strict")) { + resp_tool["strict"] = true; + } + chatcmpl_tool["function"] = resp_tool; + chatcmpl_tools.push_back(chatcmpl_tool); + } + chatcmpl_body.erase("tools"); + chatcmpl_body["tools"] = chatcmpl_tools; + } + + if (response_body.contains("max_output_tokens")) { + chatcmpl_body.erase("max_output_tokens"); + chatcmpl_body["max_tokens"] = response_body["max_output_tokens"]; + } + + return chatcmpl_body; +} + json convert_anthropic_to_oai(const json & body) { json oai_body; @@ -1482,6 +1759,24 @@ 
std::string format_oai_sse(const json & data) { return ss.str(); } +std::string format_oai_resp_sse(const json & data) { + std::ostringstream ss; + auto send_single = [&ss](const json & event_obj) { + ss << "event: " << event_obj.at("event").get() << "\n"; + ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n"; + }; + + if (data.is_array()) { + for (const auto & item : data) { + send_single(item); + } + } else { + send_single(data); + } + + return ss.str(); +} + std::string format_anthropic_sse(const json & data) { std::ostringstream ss; diff --git a/tools/server/server-common.h b/tools/server/server-common.h index a88d40494a..2629a6bee9 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -294,6 +294,9 @@ json oaicompat_chat_params_parse( const server_chat_params & opt, std::vector & out_files); +// convert OpenAI Responses API format to OpenAI Chat Completions API format +json convert_responses_to_chatcmpl(const json & body); + // convert Anthropic Messages API format to OpenAI Chat Completions API format json convert_anthropic_to_oai(const json & body); @@ -331,6 +334,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l // note: if data is a json array, it will be sent as multiple events, one per item std::string format_oai_sse(const json & data); +std::string format_oai_resp_sse(const json & data); + // format Anthropic-style SSE with event types std::string format_anthropic_sse(const json & data); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index f1f677addd..9a828e1eff 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3073,6 +3073,8 @@ std::unique_ptr server_routes::handle_completions_impl( json first_result_json = first_result->to_json(); if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { res->data = format_anthropic_sse(first_result_json); + } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) { + res->data = format_oai_resp_sse(first_result_json); } else { res->data = format_oai_sse(first_result_json); } @@ -3107,13 +3109,16 @@ std::unique_ptr server_routes::handle_completions_impl( // check if there is more data if (!rd.has_next()) { - if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { - // Anthropic doesn't send [DONE], message_stop was already sent - output = ""; - } else if (res_type != TASK_RESPONSE_TYPE_NONE) { - output = "data: [DONE]\n\n"; - } else { - output = ""; + switch (res_type) { + case TASK_RESPONSE_TYPE_NONE: + case TASK_RESPONSE_TYPE_OAI_RESP: + case TASK_RESPONSE_TYPE_ANTHROPIC: + output = ""; + break; + + default: + output = "data: [DONE]\n\n"; + break; } SRV_DBG("%s", "all results received, terminating stream\n"); return false; // no more data, terminate @@ -3141,6 +3146,8 @@ std::unique_ptr server_routes::handle_completions_impl( json res_json = result->to_json(); if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { output = format_anthropic_sse(res_json); + } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) { + output = format_oai_resp_sse(res_json); } else { output = format_oai_sse(res_json); } @@ -3575,6 +3582,22 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_CHAT); }; + this->post_responses_oai = [this](const server_http_req & req) { + auto res = create_response(); + std::vector files; + json body = convert_responses_to_chatcmpl(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( + body, + meta->chat_params, + files); + return handle_completions_impl( + req, + 
SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + TASK_RESPONSE_TYPE_OAI_RESP); + }; + this->post_anthropic_messages = [this](const server_http_req & req) { auto res = create_response(); std::vector files; diff --git a/tools/server/server-context.h b/tools/server/server-context.h index ec1df96950..3e5e870fc5 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -94,6 +94,7 @@ struct server_routes { server_http_context::handler_t post_completions; server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_responses_oai; server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; server_http_context::handler_t post_apply_template; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 2add9667d1..c098b3008a 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -584,6 +584,8 @@ json server_task_result_cmpl_final::to_json() { return to_json_oaicompat(); case TASK_RESPONSE_TYPE_OAI_CHAT: return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_OAI_RESP: + return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp(); case TASK_RESPONSE_TYPE_ANTHROPIC: return stream ? to_json_anthropic_stream() : to_json_anthropic(); default: @@ -801,6 +803,186 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { return deltas; } +json server_task_result_cmpl_final::to_json_oaicompat_resp() { + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } else { + msg.role = "assistant"; + msg.content = content; + } + + std::vector output; + + if (msg.reasoning_content != "") { + output.push_back(json { + {"id", "rs_" + random_string()}, + {"summary", json::array()}, + {"type", "reasoning"}, + {"content", json::array({ json { + {"text", msg.reasoning_content}, + {"type", "reasoning_text"}, + }})}, + {"encrypted_content", ""}, + {"status", "completed"}, + }); + } + + if (msg.content != "") { + output.push_back(json { + {"content", json::array({ json { + {"type", "output_text"}, + {"annotations", json::array()}, + {"logprobs", json::array()}, + {"text", msg.content}, + }})}, + {"id", "msg_" + random_string()}, + {"role", msg.role}, + {"status", "completed"}, + {"type", "message"}, + }); + } + + for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { + output.push_back(json { + {"type", "function_call"}, + {"status", "completed"}, + {"arguments", tool_call.arguments}, + {"call_id", "fc_" + tool_call.id}, + {"name", tool_call.name}, + }); + } + + std::time_t t = std::time(0); + json res = { + {"completed_at", t}, + {"created_at", t}, + {"id", oai_resp_id}, + {"model", oaicompat_model}, + {"object", "response"}, + {"output", output}, + {"status", "completed"}, + {"usage", json { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}, + }; + + return res; +} + +json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { + std::vector server_sent_events; + std::vector output; + + if (oaicompat_msg.reasoning_content != "") { + const json output_item = json { + {"id", oai_resp_reasoning_id}, + {"summary", json::array()}, + {"type", "reasoning"}, + {"content", json::array({ json { + {"text", oaicompat_msg.reasoning_content}, + {"type", "reasoning_text"}, + }})}, + {"encrypted_content", ""}, + }; + + 
server_sent_events.push_back(json { + {"event", "response.output_item.done"}, + {"data", json { + {"type", "response.output_item.done"}, + {"item", output_item} + }} + }); + output.push_back(output_item); + } + + if (oaicompat_msg.content != "") { + server_sent_events.push_back(json { + {"event", "response.output_text.done"}, + {"data", json { + {"type", "response.output_text.done"}, + {"item_id", oai_resp_message_id}, + {"text", oaicompat_msg.content} + }} + }); + + const json content_part = { + {"type", "output_text"}, + {"annotations", json::array()}, + {"logprobs", json::array()}, + {"text", oaicompat_msg.content} + }; + + server_sent_events.push_back(json { + {"event", "response.content_part.done"}, + {"data", json { + {"type", "response.content_part.done"}, + {"item_id", oai_resp_message_id}, + {"part", content_part} + }} + }); + const json output_item = { + {"type", "message"}, + {"status", "completed"}, + {"id", oai_resp_message_id}, + {"content", json::array({content_part})}, + {"role", "assistant"} + }; + + server_sent_events.push_back(json { + {"event", "response.output_item.done"}, + {"data", json { + {"type", "response.output_item.done"}, + {"item", output_item} + }} + }); + output.push_back(output_item); + } + + for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { + const json output_item = { + {"type", "function_call"}, + {"status", "completed"}, + {"arguments", tool_call.arguments}, + {"call_id", "fc_" + tool_call.id}, + {"name", tool_call.name} + }; + server_sent_events.push_back(json { + {"event", "response.output_item.done"}, + {"data", json { + {"type", "response.output_item.done"}, + {"item", output_item} + }} + }); + output.push_back(output_item); + } + + std::time_t t = std::time(0); + server_sent_events.push_back(json { + {"event", "response.completed"}, + {"data", json { + {"type", "response.completed"}, + {"response", json { + {"id", oai_resp_id}, + {"object", "response"}, + {"created_at", t}, + {"status", "completed"}, + {"model", oaicompat_model}, + {"output", output}, + {"usage", json { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens} + }} + }}, + }} + }); + + return server_sent_events; +} + json server_task_result_cmpl_final::to_json_anthropic() { std::string stop_reason = "max_tokens"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { @@ -1057,6 +1239,36 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() { // // server_task_result_cmpl_partial // +void server_task_result_cmpl_partial::update(task_result_state & state) { + is_updated = true; + state.update_chat_msg(content, true, oaicompat_msg_diffs); + + // Copy current state for use in to_json_*() (reflects state BEFORE this chunk) + thinking_block_started = state.thinking_block_started; + text_block_started = state.text_block_started; + + oai_resp_id = state.oai_resp_id; + oai_resp_reasoning_id = state.oai_resp_reasoning_id; + oai_resp_message_id = state.oai_resp_message_id; + oai_resp_fc_id = state.oai_resp_fc_id; + + // track if the accumulated message has any reasoning content + anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty(); + + // Pre-compute state updates based on diffs (for next chunk) + for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) { + if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) { + state.thinking_block_started = true; + } + if (!diff.content_delta.empty() && !state.text_block_started) { + state.text_block_started = true; + 
} + if (!diff.tool_call_delta.name.empty()) { + state.oai_resp_fc_id = diff.tool_call_delta.id; + } + } +} + json server_task_result_cmpl_partial::to_json() { GGML_ASSERT(is_updated && "update() must be called before to_json()"); switch (res_type) { @@ -1066,6 +1278,8 @@ json server_task_result_cmpl_partial::to_json() { return to_json_oaicompat(); case TASK_RESPONSE_TYPE_OAI_CHAT: return to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_OAI_RESP: + return to_json_oaicompat_resp(); case TASK_RESPONSE_TYPE_ANTHROPIC: return to_json_anthropic(); default: @@ -1190,6 +1404,132 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() { return deltas; } +json server_task_result_cmpl_partial::to_json_oaicompat_resp() { + std::vector events; + + if (n_decoded == 1) { + events.push_back(json { + {"event", "response.created"}, + {"data", json { + {"type", "response.created"}, + {"response", json { + {"id", oai_resp_id}, + {"object", "response"}, + {"status", "in_progress"}, + }}, + }}, + }); + events.push_back(json { + {"event", "response.in_progress"}, + {"data", json { + {"type", "response.in_progress"}, + {"response", json { + {"id", oai_resp_id}, + {"object", "response"}, + {"status", "in_progress"}, + }}, + }}, + }); + } + + for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) { + if (!diff.reasoning_content_delta.empty()) { + if (!thinking_block_started) { + events.push_back(json { + {"event", "response.output_item.added"}, + {"data", json { + {"type", "response.output_item.added"}, + {"item", json { + {"id", oai_resp_reasoning_id}, + {"summary", json::array()}, + {"type", "reasoning"}, + {"content", json::array()}, + {"encrypted_content", ""}, + {"status", "in_progress"}, + }}, + }}, + }); + thinking_block_started = true; + } + events.push_back(json { + {"event", "response.reasoning_text.delta"}, + {"data", json { + {"type", "response.reasoning_text.delta"}, + {"delta", diff.reasoning_content_delta}, + {"item_id", oai_resp_reasoning_id}, + }}, + }); + } + + if (!diff.content_delta.empty()) { + if (!text_block_started) { + events.push_back(json { + {"event", "response.output_item.added"}, + {"data", json { + {"type", "response.output_item.added"}, + {"item", json { + {"content", json::array()}, + {"id", oai_resp_message_id}, + {"role", "assistant"}, + {"status", "in_progress"}, + {"type", "message"}, + }}, + }}, + }); + events.push_back(json { + {"event", "response.content_part.added"}, + {"data", json { + {"type", "response.content_part.added"}, + {"item_id", oai_resp_message_id}, + {"part", json { + {"type", "output_text"}, + {"text", ""}, + }}, + }}, + }); + text_block_started = true; + } + events.push_back(json { + {"event", "response.output_text.delta"}, + {"data", json { + {"type", "response.output_text.delta"}, + {"item_id", oai_resp_message_id}, + {"delta", diff.content_delta}, + }}, + }); + } + + if (!diff.tool_call_delta.name.empty()) { + events.push_back(json { + {"event", "response.output_item.added"}, + {"data", json { + {"type", "response.output_item.added"}, + {"item", json { + {"arguments", ""}, + {"call_id", "fc_" + diff.tool_call_delta.id}, + {"name", diff.tool_call_delta.name}, + {"type", "function_call"}, + {"status", "in_progress"}, + }}, + }}, + }); + oai_resp_fc_id = diff.tool_call_delta.id; + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back(json { + {"event", "response.function_call_arguments.delta"}, + {"data", json { + {"type", "response.function_call_arguments.delta"}, + {"delta", diff.tool_call_delta.arguments}, + 
{"item_id", "fc_" + oai_resp_fc_id}, + }}, + }); + } + } + return events; +} + // // server_task_result_embd // @@ -1260,8 +1600,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() { // use local copies of streaming state (copied from task_result_state in update()) // these reflect the state BEFORE this chunk was processed - bool thinking_started = anthropic_thinking_block_started; - bool text_started = anthropic_text_block_started; + bool thinking_started = thinking_block_started; + bool text_started = text_block_started; for (const auto & diff : oaicompat_msg_diffs) { // handle thinking/reasoning content diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 6835eef507..244470596b 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -33,6 +33,7 @@ enum task_response_type { TASK_RESPONSE_TYPE_NONE, // llama.cpp native format TASK_RESPONSE_TYPE_OAI_CHAT, TASK_RESPONSE_TYPE_OAI_CMPL, + TASK_RESPONSE_TYPE_OAI_RESP, TASK_RESPONSE_TYPE_OAI_EMBD, TASK_RESPONSE_TYPE_ANTHROPIC, }; @@ -98,12 +99,22 @@ struct task_result_state { std::string generated_text; // append new chunks of generated text here std::vector generated_tool_call_ids; - // for Anthropic API streaming: track content block state across chunks - bool anthropic_thinking_block_started = false; - bool anthropic_text_block_started = false; + // for OpenAI Responses and Anthropic streaming API: + // track output item / content block state across chunks + bool thinking_block_started = false; + bool text_block_started = false; + + // for OpenAI Responses streaming API + const std::string oai_resp_id; + const std::string oai_resp_reasoning_id; + const std::string oai_resp_message_id; + std::string oai_resp_fc_id; // function call ID for current args delta task_result_state(const common_chat_parser_params & chat_parser_params) - : chat_parser_params(chat_parser_params) {} + : chat_parser_params(chat_parser_params) + , oai_resp_id("resp_" + random_string()) + , oai_resp_reasoning_id("rs_" + random_string()) + , oai_resp_message_id("msg_" + random_string()) {} // parse partial tool calls and update the internal state common_chat_msg update_chat_msg( @@ -352,6 +363,11 @@ struct server_task_result_cmpl_final : server_task_result { std::vector oaicompat_msg_diffs; // to be populated by update() bool is_updated = false; + // for OpenAI Responses API + std::string oai_resp_id; + std::string oai_resp_reasoning_id; + std::string oai_resp_message_id; + virtual bool is_stop() override { return true; // in stream mode, final responses are considered stop } @@ -361,6 +377,10 @@ struct server_task_result_cmpl_final : server_task_result { virtual void update(task_result_state & state) override { is_updated = true; oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs); + + oai_resp_id = state.oai_resp_id; + oai_resp_reasoning_id = state.oai_resp_reasoning_id; + oai_resp_message_id = state.oai_resp_message_id; } json to_json_non_oaicompat(); @@ -371,6 +391,10 @@ struct server_task_result_cmpl_final : server_task_result { json to_json_oaicompat_chat_stream(); + json to_json_oaicompat_resp(); + + json to_json_oaicompat_resp_stream(); + json to_json_anthropic(); json to_json_anthropic_stream(); @@ -397,45 +421,35 @@ struct server_task_result_cmpl_partial : server_task_result { std::vector oaicompat_msg_diffs; // to be populated by update() bool is_updated = false; + // Streaming state copied from task_result_state for this chunk + bool thinking_block_started = false; + bool 
text_block_started = false; + + // for OpenAI Responses API + std::string oai_resp_id; + std::string oai_resp_reasoning_id; + std::string oai_resp_message_id; + std::string oai_resp_fc_id; + // for Anthropic API: track if any reasoning content has been generated bool anthropic_has_reasoning = false; - // Streaming state copied from task_result_state for this chunk - bool anthropic_thinking_block_started = false; - bool anthropic_text_block_started = false; virtual bool is_stop() override { return false; // in stream mode, partial responses are not considered stop } + virtual void update(task_result_state & state) override; + virtual json to_json() override; - virtual void update(task_result_state & state) override { - is_updated = true; - state.update_chat_msg(content, true, oaicompat_msg_diffs); - // track if the accumulated message has any reasoning content - anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty(); - - // Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk) - anthropic_thinking_block_started = state.anthropic_thinking_block_started; - anthropic_text_block_started = state.anthropic_text_block_started; - - // Pre-compute state updates based on diffs (for next chunk) - for (const auto & diff : oaicompat_msg_diffs) { - if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) { - state.anthropic_thinking_block_started = true; - } - if (!diff.content_delta.empty() && !state.anthropic_text_block_started) { - state.anthropic_text_block_started = true; - } - } - } - json to_json_non_oaicompat(); json to_json_oaicompat(); json to_json_oaicompat_chat(); + json to_json_oaicompat_resp(); + json to_json_anthropic(); }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1d9abf6055..d3d4316026 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -140,6 +140,7 @@ int main(int argc, char ** argv) { routes.post_completions = models_routes->proxy_post; routes.post_completions_oai = models_routes->proxy_post; routes.post_chat_completions = models_routes->proxy_post; + routes.post_responses_oai = models_routes->proxy_post; routes.post_anthropic_messages = models_routes->proxy_post; routes.post_anthropic_count_tokens = models_routes->proxy_post; routes.post_infill = models_routes->proxy_post; @@ -176,6 +177,7 @@ int main(int argc, char ** argv) { ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint + ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai)); ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting ctx_http.post("/infill", ex_wrapper(routes.post_infill)); diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt index 4ea7f19f77..ca79d025ed 100644 --- a/tools/server/tests/requirements.txt +++ b/tools/server/tests/requirements.txt @@ -2,7 +2,7 @@ aiohttp~=3.9.3 pytest~=8.3.3 huggingface_hub>=0.34.0,<1.0 numpy~=1.26.4 -openai~=1.55.3 +openai~=2.14.0 prometheus-client~=0.20.0 requests~=2.32.3 wget~=3.2 diff --git a/tools/server/tests/unit/test_compat_oai_responses.py b/tools/server/tests/unit/test_compat_oai_responses.py new file mode 100644 index 0000000000..7aab4a8ba6 --- 
/dev/null +++ b/tools/server/tests/unit/test_compat_oai_responses.py @@ -0,0 +1,73 @@ +import pytest +from openai import OpenAI +from utils import * + +server: ServerProcess + +@pytest.fixture(autouse=True) +def create_server(): + global server + server = ServerPreset.tinyllama2() + +def test_responses_with_openai_library(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + res = client.responses.create( + model="gpt-4.1", + input=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_output_tokens=8, + temperature=0.8, + ) + assert res.id.startswith("resp_") + assert res.output[0].id is not None + assert res.output[0].id.startswith("msg_") + assert match_regex("(Suddenly)+", res.output_text) + +def test_responses_stream_with_openai_library(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + stream = client.responses.create( + model="gpt-4.1", + input=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_output_tokens=8, + temperature=0.8, + stream=True, + ) + + gathered_text = '' + resp_id = '' + msg_id = '' + for r in stream: + if r.type == "response.created": + assert r.response.id.startswith("resp_") + resp_id = r.response.id + if r.type == "response.in_progress": + assert r.response.id == resp_id + if r.type == "response.output_item.added": + assert r.item.id is not None + assert r.item.id.startswith("msg_") + msg_id = r.item.id + if (r.type == "response.content_part.added" or + r.type == "response.output_text.delta" or + r.type == "response.output_text.done" or + r.type == "response.content_part.done"): + assert r.item_id == msg_id + if r.type == "response.output_item.done": + assert r.item.id == msg_id + + if r.type == "response.output_text.delta": + gathered_text += r.delta + if r.type == "response.completed": + assert r.response.id.startswith("resp_") + assert r.response.output[0].id is not None + assert r.response.output[0].id.startswith("msg_") + assert gathered_text == r.response.output_text + assert match_regex("(Suddenly)+", r.response.output_text)
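
Beyond these tests, the tool-call path of the new endpoint can be exercised from a client in the same way. Below is a minimal sketch (not part of the patch), assuming a hypothetical `get_weather` tool and a model that actually emits tool calls, which the tinyllama2 preset used above does not. It relies only on behaviour introduced here: Responses-style flat tool definitions are rewrapped by `convert_responses_to_chatcmpl()`, and completed calls come back as `function_call` output items whose `call_id` carries the `fc_` prefix.

```python
from openai import OpenAI

client = OpenAI(api_key="sk-no-key-required", base_url="http://localhost:8080/v1")

# Responses API tools are flat objects; convert_responses_to_chatcmpl() wraps each
# one back into the Chat Completions {"type": "function", "function": {...}} shape.
tools = [{
    "type": "function",
    "name": "get_weather",  # hypothetical tool, for illustration only
    "description": "Get the current weather for a city",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}]

res = client.responses.create(
    model="gpt-4.1",
    input="What is the weather in Busan?",
    tools=tools,
)

# Tool calls are returned as "function_call" output items with an "fc_"-prefixed
# call_id, a name, and JSON-encoded arguments (see to_json_oaicompat_resp()).
for item in res.output:
    if item.type == "function_call":
        print(item.call_id, item.name, item.arguments)
```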