From 1df28c40539df2239db91d62d39c54a49bbbcffa Mon Sep 17 00:00:00 2001
From: openingnow <>
Date: Tue, 30 Dec 2025 05:44:08 +0000
Subject: [PATCH] server: add OpenAI-compatible /v1/responses endpoint (from previous PR)
---
 requirements/requirements-tool_bench.txt | 2 +-
 tools/server/server-common.cpp | 60 +++++++
 tools/server/server-common.h | 5 +
 tools/server/server-context.cpp | 87 ++++++++++-
 tools/server/server-context.h | 1 +
 tools/server/server-task.cpp | 147 ++++++++++++++++++
 tools/server/server-task.h | 7 +
 tools/server/server.cpp | 2 +
 tools/server/tests/requirements.txt | 2 +-
 .../tests/unit/test_compat_oai_responses.py | 48 ++++++
 10 files changed, 352 insertions(+), 9 deletions(-)
 create mode 100644 tools/server/tests/unit/test_compat_oai_responses.py

diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt
index f7912aff72..3bb74fb9d0 100644
--- a/requirements/requirements-tool_bench.txt
+++ b/requirements/requirements-tool_bench.txt
@@ -3,7 +3,7 @@ pytest~=8.3.3
 huggingface_hub>=0.34.0,<1.0
 matplotlib~=3.10.0
 numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
 pandas~=2.2.3
 prometheus-client~=0.20.0
 requests~=2.32.3
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index b02afaefda..96ec86edbc 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1070,6 +1070,48 @@ json oaicompat_chat_params_parse(
    return llama_params;
}

+json convert_responses_to_chatcmpl(const json & body) {
    if (!body.contains("input")) {
        throw std::invalid_argument("'input' is required");
    }
    if (!json_value(body, "previous_response_id", std::string{}).empty()) {
        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
    }

    const json input_value = body.at("input");
    json chatcmpl_messages = json::array();

    if (input_value.is_array()) {
        chatcmpl_messages = input_value;
    } else if (input_value.is_string()) {
        chatcmpl_messages.push_back({
            {"role", "user"},
            {"content", input_value},
        });
    } else {
        throw std::invalid_argument("'input' must be a string or array of objects");
    }

    const std::string instructions = json_value(body, "instructions", std::string{});
    if (!instructions.empty()) {
        // 'instructions' acts as a system message, so it goes before the input messages
        chatcmpl_messages.insert(chatcmpl_messages.begin(), json {
            {"role", "system"},
            {"content", instructions},
        });
    }

    json chatcmpl_body = body;
    chatcmpl_body.erase("input");
    chatcmpl_body["messages"] = chatcmpl_messages;

    if (body.contains("max_output_tokens")) {
        chatcmpl_body.erase("max_output_tokens");
        chatcmpl_body["max_tokens"] = body["max_output_tokens"];
    }

    return chatcmpl_body;
}

json convert_anthropic_to_oai(const json & body) {
    json oai_body;
@@ -1478,6 +1520,24 @@ std::string format_oai_sse(const json & data) {
    return ss.str();
}

+std::string format_oai_resp_sse(const json & data) {
    std::ostringstream ss;
    auto send_single = [&ss](const json & event_obj) {
        ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
        ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
    };

    if (data.is_array()) {
        for (const auto & item : data) {
            send_single(item);
        }
    } else {
        send_single(data);
    }

    return ss.str();
}

std::string format_anthropic_sse(const json & data) {
    std::ostringstream ss;
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 152a2a3c46..5827a6fc4d 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -295,6 +295,9 @@ json oaicompat_chat_params_parse(
    const oaicompat_parser_options & opt,
    std::vector<raw_buffer> & out_files);

+// convert OpenAI Responses API format to OpenAI Chat Completions API format
+json convert_responses_to_chatcmpl(const json & body);
+
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
 json convert_anthropic_to_oai(const json & body);

@@ -332,6 +335,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
 // note: if data is a json array, it will be sent as multiple events, one per item
 std::string format_oai_sse(const json & data);

+std::string format_oai_resp_sse(const json & data);
+
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 9726e02522..cc2016dff2 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2981,6 +2981,58 @@ std::unique_ptr server_routes::handle_completions_impl(
            json first_result_json = first_result->to_json();
            if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
                res->data = format_anthropic_sse(first_result_json);
+           } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
                const json created = {
                    {"event", "response.created"},
                    {"data", json {
                        {"type", "response.created"},
                        {"response", json {
                            {"object", "response"},
                            {"status", "in_progress"}
                        }}
                    }}
                };
                const json in_progress = {
                    {"event", "response.in_progress"},
                    {"data", json {
                        {"type", "response.in_progress"},
                        {"response", json {
                            {"object", "response"},
                            {"status", "in_progress"}
                        }}
                    }}
                };
                const json output_item_added = {
                    {"event", "response.output_item.added"},
                    {"data", json {
                        {"type", "response.output_item.added"},
                        {"item", json {
                            {"type", "message"},
                            {"status", "in_progress"},
                            {"content", json::array()},
                            {"role", "assistant"}
                        }}
                    }}
                };
                const json content_part_added = {
                    {"event", "response.content_part.added"},
                    {"data", json {
                        {"type", "response.content_part.added"},
                        {"part", json {
                            {"type", "output_text"},
                            {"text", ""}
                        }}
                    }}
                };

                const json initial_events = json::array({
                    created,
                    in_progress,
                    output_item_added,
                    content_part_added
                });

                res->data = format_oai_resp_sse(initial_events) + format_oai_resp_sse(first_result_json);
            } else {
                res->data = format_oai_sse(first_result_json);
            }
@@ -3015,13 +3067,16 @@ std::unique_ptr server_routes::handle_completions_impl(
            // check if there is more data
            if (!rd.has_next()) {
-               if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
-                   // Anthropic doesn't send [DONE], message_stop was already sent
-                   output = "";
-               } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
-                   output = "data: [DONE]\n\n";
-               } else {
-                   output = "";
+               switch (res_type) {
+                   case TASK_RESPONSE_TYPE_NONE:
+                   case TASK_RESPONSE_TYPE_OAI_RESP:
+                   case TASK_RESPONSE_TYPE_ANTHROPIC:
+                       output = "";
+                       break;
+
+                   default:
+                       output = "data: [DONE]\n\n";
+                       break;
                }
                SRV_DBG("%s", "all results received, terminating stream\n");
                return false; // no more data, terminate
@@ -3049,6 +3104,8 @@ std::unique_ptr server_routes::handle_completions_impl(
            json res_json = result->to_json();
            if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
                output = format_anthropic_sse(res_json);
+           } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+               output = format_oai_resp_sse(res_json);
            } else {
                output = format_oai_sse(res_json);
            }
@@ -3479,6 +3536,22 @@ void server_routes::init_routes() {
            TASK_RESPONSE_TYPE_OAI_CHAT);
    };

+   this->post_responses_oai = [this](const server_http_req & req) {
        auto res = create_response();
        std::vector<raw_buffer> files;
        json body =
convert_responses_to_chatcmpl(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + req, + SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + TASK_RESPONSE_TYPE_OAI_RESP); + }; + this->post_anthropic_messages = [this](const server_http_req & req) { auto res = create_response(); std::vector files; diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 09bec15ae1..3bf81b447e 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -95,6 +95,7 @@ struct server_routes { server_http_context::handler_t post_completions; server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_responses_oai; server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; server_http_context::handler_t post_apply_template; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 22f5b2059c..f8093677d5 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -578,6 +578,8 @@ json server_task_result_cmpl_final::to_json() { return to_json_oaicompat(); case TASK_RESPONSE_TYPE_OAI_CHAT: return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_OAI_RESP: + return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp(); case TASK_RESPONSE_TYPE_ANTHROPIC: return stream ? to_json_anthropic_stream() : to_json_anthropic(); default: @@ -795,6 +797,122 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { return deltas; } +json server_task_result_cmpl_final::to_json_oaicompat_resp() { + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } else { + msg.role = "assistant"; + msg.content = content; + } + + const json reasoning = { + {"type", "reasoning"}, + {"summary", json::array({json { + {"type", "summary_text"}, + {"text", msg.reasoning_content} + }})} + }; + const json message = { + {"type", "message"}, + {"status", "completed"}, + {"content", json::array({json { + {"type", "output_text"}, + {"annotations", json::array()}, + {"logprobs", json::array()}, + {"text", msg.content} + }})}, + {"role", msg.role} + }; + + std::time_t t = std::time(0); + json res = { + {"object", "response"}, + {"created_at", t}, + {"status", "completed"}, + {"model", oaicompat_model}, + {"output", json::array({reasoning, message})}, + {"usage", json { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + }; + + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; +} + +json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { + json server_sent_events = json::array(); + + server_sent_events.push_back(json { + {"event", "response.output_text.done"}, + {"data", json { + {"type", "response.output_text.done"}, + {"text", oaicompat_msg.content} + }} + }); + + const json part = { + {"type", "output_text"}, + {"annotations", json::array()}, + {"logprobs", json::array()}, + {"text", oaicompat_msg.content} + }; + + server_sent_events.push_back(json { + {"event", "response.content_part.done"}, + {"data", json { + {"type", "response.content_part.done"}, + {"part", part} + }} + }); + + const json item = { + {"type", "message"}, + 
{"status", "completed"}, + {"content", json::array({part})}, + {"role", "assistant"} + }; + + server_sent_events.push_back(json { + {"event", "response.output_item.done"}, + {"data", json { + {"type", "response.output_item.done"}, + {"item", item} + }} + }); + + std::time_t t = std::time(0); + server_sent_events.push_back(json { + {"event", "response.completed"}, + {"data", json { + {"type", "response.completed"}, + {"response", json { + {"object", "response"}, + {"created_at", t}, + {"status", "completed"}, + {"model", oaicompat_model}, + {"output", json::array({item})}, + {"usage", json { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens} + }} + }}, + }} + }); + + return server_sent_events; +} + json server_task_result_cmpl_final::to_json_anthropic() { std::string stop_reason = "max_tokens"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { @@ -989,6 +1107,8 @@ json server_task_result_cmpl_partial::to_json() { return to_json_oaicompat(); case TASK_RESPONSE_TYPE_OAI_CHAT: return to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_OAI_RESP: + return to_json_oaicompat_resp(); case TASK_RESPONSE_TYPE_ANTHROPIC: return to_json_anthropic(); default: @@ -1058,6 +1178,33 @@ json server_task_result_cmpl_partial::to_json_oaicompat() { return res; } +json server_task_result_cmpl_partial::to_json_oaicompat_resp() { + std::vector deltas; + + for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) { + if (!diff.reasoning_content_delta.empty()) { + deltas.push_back(json { + {"event", "response.reasoning_text.delta"}, + {"data", json { + {"type", "response.reasoning_text.delta"}, + {"delta", diff.reasoning_content_delta} + }} + }); + } + if (!diff.content_delta.empty()) { + deltas.push_back(json { + {"event", "response.output_text.delta"}, + {"data", json { + {"type", "response.output_text.delta"}, + {"delta", diff.content_delta} + }} + }); + } + } + + return deltas; +} + json server_task_result_cmpl_partial::to_json_oaicompat_chat() { bool first = n_decoded == 1; std::time_t t = std::time(0); diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 687770de5e..3c411910d1 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -33,6 +33,7 @@ enum task_response_type { TASK_RESPONSE_TYPE_NONE, // llama.cpp native format TASK_RESPONSE_TYPE_OAI_CHAT, TASK_RESPONSE_TYPE_OAI_CMPL, + TASK_RESPONSE_TYPE_OAI_RESP, TASK_RESPONSE_TYPE_OAI_EMBD, TASK_RESPONSE_TYPE_ANTHROPIC, }; @@ -311,6 +312,10 @@ struct server_task_result_cmpl_final : server_task_result { json to_json_oaicompat_chat_stream(); + json to_json_oaicompat_resp(); + + json to_json_oaicompat_resp_stream(); + json to_json_anthropic(); json to_json_anthropic_stream(); @@ -354,6 +359,8 @@ struct server_task_result_cmpl_partial : server_task_result { json to_json_oaicompat_chat(); + json to_json_oaicompat_resp(); + json to_json_anthropic(); }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0fbc7b6d35..f73eb12763 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -140,6 +140,7 @@ int main(int argc, char ** argv, char ** envp) { routes.post_completions = models_routes->proxy_post; routes.post_completions_oai = models_routes->proxy_post; routes.post_chat_completions = models_routes->proxy_post; + routes.post_responses_oai = models_routes->proxy_post; routes.post_anthropic_messages = models_routes->proxy_post; routes.post_anthropic_count_tokens = models_routes->proxy_post; routes.post_infill = 
models_routes->proxy_post; @@ -176,6 +177,7 @@ int main(int argc, char ** argv, char ** envp) { ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint + ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai)); ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting ctx_http.post("/infill", ex_wrapper(routes.post_infill)); diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt index 4ea7f19f77..ca79d025ed 100644 --- a/tools/server/tests/requirements.txt +++ b/tools/server/tests/requirements.txt @@ -2,7 +2,7 @@ aiohttp~=3.9.3 pytest~=8.3.3 huggingface_hub>=0.34.0,<1.0 numpy~=1.26.4 -openai~=1.55.3 +openai~=2.14.0 prometheus-client~=0.20.0 requests~=2.32.3 wget~=3.2 diff --git a/tools/server/tests/unit/test_compat_oai_responses.py b/tools/server/tests/unit/test_compat_oai_responses.py new file mode 100644 index 0000000000..e168f4562d --- /dev/null +++ b/tools/server/tests/unit/test_compat_oai_responses.py @@ -0,0 +1,48 @@ +import pytest +from openai import OpenAI +from utils import * + +server: ServerProcess + +@pytest.fixture(autouse=True) +def create_server(): + global server + server = ServerPreset.tinyllama2() + +def test_responses_with_openai_library(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + res = client.responses.create( + model="gpt-4.1", + input=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_output_tokens=8, + temperature=0.8, + ) + assert match_regex("(Suddenly)+", res.output_text) + +def test_responses_stream_with_openai_library(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + stream = client.responses.create( + model="gpt-4.1", + input=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_output_tokens=8, + temperature=0.8, + stream=True, + ) + + gathered_text = '' + for r in stream: + if r.type == "response.output_text.delta": + gathered_text += r.delta + if r.type == "response.completed": + assert gathered_text == r.response.output_text + assert match_regex("(Suddenly)+", r.response.output_text)
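
For reference only, below the patch: a minimal sketch of how a client could exercise the new /v1/responses endpoint and its SSE events without the openai package. The host, port, and model name are placeholders, and the event names simply mirror those emitted by format_oai_resp_sse in this patch; this is an illustration, not part of the change itself.

# Hedged sketch: stream from the /v1/responses endpoint added by this patch and
# reassemble the text deltas. Assumes a llama.cpp server listening on
# localhost:8080; host, port, and the model string are placeholders.
import json
import requests

resp = requests.post(
    "http://localhost:8080/v1/responses",
    json={
        "model": "gpt-4.1",          # placeholder; the server uses its loaded model
        "input": "What is the best book?",
        "max_output_tokens": 8,
        "stream": True,
    },
    stream=True,
)

streamed_text = ""
for raw in resp.iter_lines(decode_unicode=True):
    # each SSE frame is "event: <name>" / "data: <json>" followed by a blank line;
    # only the data payloads are parsed here
    if not raw or not raw.startswith("data: "):
        continue
    event = json.loads(raw[len("data: "):])
    if event.get("type") == "response.output_text.delta":
        streamed_text += event["delta"]
    elif event.get("type") == "response.completed":
        final = event["response"]["output"][-1]["content"][0]["text"]
        print("final:", final)
print("streamed:", streamed_text)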