from previous PR

This commit is contained in:
openingnow 2025-12-30 05:44:08 +00:00
parent d5574c919c
commit 1df28c4053
10 changed files with 352 additions and 9 deletions

View File

@ -3,7 +3,7 @@ pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
matplotlib~=3.10.0
numpy~=1.26.4
openai~=1.55.3
openai~=2.14.0
pandas~=2.2.3
prometheus-client~=0.20.0
requests~=2.32.3

View File

@ -1070,6 +1070,48 @@ json oaicompat_chat_params_parse(
return llama_params;
}
json convert_responses_to_chatcmpl(const json & body) {
if (!body.contains("input")) {
throw std::invalid_argument("'input' is required");
}
if (!json_value(body, "previous_response_id", std::string{}).empty()) {
throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
}
const json input_value = body.at("input");
json chatcmpl_messages = json::array();
if (input_value.is_array()) {
chatcmpl_messages = input_value;
} else if (input_value.is_string()) {
chatcmpl_messages.push_back({
{"role", "user"},
{"content", input_value},
});
} else {
throw std::invalid_argument("'input' must be a string or array of objects");
}
const std::string instructions = json_value(body, "instructions", std::string{});
if (!instructions.empty()) {
chatcmpl_messages.push_back({
{"role", "system"},
{"content", instructions},
});
}
json chatcmpl_body = body;
chatcmpl_body.erase("input");
chatcmpl_body["messages"] = chatcmpl_messages;
if (body.contains("max_output_tokens")) {
chatcmpl_body.erase("max_output_tokens");
chatcmpl_body["max_tokens"] = body["max_output_tokens"];
}
return chatcmpl_body;
}
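
For reference, a minimal Python sketch of the request-body mapping that convert_responses_to_chatcmpl performs above (illustration only; the server does this in C++, and the helper name here is hypothetical):

# Illustrative sketch of the Responses -> Chat Completions body mapping;
# mirrors convert_responses_to_chatcmpl above, not used by the server.
def responses_to_chatcmpl(body: dict) -> dict:
    if "input" not in body:
        raise ValueError("'input' is required")
    if body.get("previous_response_id"):
        raise ValueError("llama.cpp does not support 'previous_response_id'.")
    value = body["input"]
    if isinstance(value, list):
        messages = list(value)
    elif isinstance(value, str):
        messages = [{"role": "user", "content": value}]
    else:
        raise ValueError("'input' must be a string or array of objects")
    if body.get("instructions"):
        messages.append({"role": "system", "content": body["instructions"]})
    out = {k: v for k, v in body.items() if k not in ("input", "max_output_tokens")}
    out["messages"] = messages
    if "max_output_tokens" in body:
        out["max_tokens"] = body["max_output_tokens"]
    return out
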
json convert_anthropic_to_oai(const json & body) {
json oai_body;
@ -1478,6 +1520,24 @@ std::string format_oai_sse(const json & data) {
return ss.str();
}
std::string format_oai_resp_sse(const json & data) {
std::ostringstream ss;
auto send_single = [&ss](const json & event_obj) {
ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
};
if (data.is_array()) {
for (const auto & item : data) {
send_single(item);
}
} else {
send_single(data);
}
return ss.str();
}
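
A rough Python equivalent of the SSE framing used by format_oai_resp_sse, for illustration only: each object becomes an "event:" line plus a "data:" line, and an array yields one such pair per item.

import json

# Illustrative sketch of the SSE framing above; not used by the server.
def format_resp_sse(data) -> str:
    events = data if isinstance(data, list) else [data]
    chunks = []
    for ev in events:
        chunks.append(f"event: {ev['event']}\n")
        chunks.append(f"data: {json.dumps(ev['data'])}\n\n")
    return "".join(chunks)

# Example output for a single delta event:
#   event: response.output_text.delta
#   data: {"type": "response.output_text.delta", "delta": "Hello"}
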
std::string format_anthropic_sse(const json & data) {
std::ostringstream ss;

View File

@ -295,6 +295,9 @@ json oaicompat_chat_params_parse(
const oaicompat_parser_options & opt,
std::vector<raw_buffer> & out_files);
// convert OpenAI Responses API format to OpenAI Chat Completions API format
json convert_responses_to_chatcmpl(const json & body);
// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);
@ -332,6 +335,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_sse(const json & data);
std::string format_oai_resp_sse(const json & data);
// format Anthropic-style SSE with event types
std::string format_anthropic_sse(const json & data);

View File

@ -2981,6 +2981,58 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
json first_result_json = first_result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
res->data = format_anthropic_sse(first_result_json);
} else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
const json created = {
{"event", "response.created"},
{"data", json {
{"type", "response.created"},
{"response", json {
{"object", "response"},
{"status", "in_progress"}
}}
}}
};
const json in_progress = {
{"event", "response.in_progress"},
{"data", json {
{"type", "response.in_progress"},
{"response", json {
{"object", "response"},
{"status", "in_progress"}
}}
}}
};
const json output_item_added = {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"type", "message"},
{"status", "in_progress"},
{"content", json::array()},
{"role", "assistant"}
}}
}}
};
const json content_part_added = {
{"event", "response.content_part.added"},
{"data", json {
{"type", "response.content_part.added"},
{"part", json {
{"type", "output_text"},
{"text", ""}
}}
}}
};
const json initial_events = json::array({
created,
in_progress,
output_item_added,
content_part_added
});
res->data = format_oai_resp_sse(initial_events) + format_oai_resp_sse(first_result_json);
} else {
res->data = format_oai_sse(first_result_json);
}
@ -3015,13 +3067,16 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
// check if there is more data
if (!rd.has_next()) {
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
// Anthropic doesn't send [DONE], message_stop was already sent
output = "";
} else if (res_type != TASK_RESPONSE_TYPE_NONE) {
output = "data: [DONE]\n\n";
} else {
output = "";
switch (res_type) {
case TASK_RESPONSE_TYPE_NONE:
case TASK_RESPONSE_TYPE_OAI_RESP:
case TASK_RESPONSE_TYPE_ANTHROPIC:
output = "";
break;
default:
output = "data: [DONE]\n\n";
break;
}
SRV_DBG("%s", "all results received, terminating stream\n");
return false; // no more data, terminate
@ -3049,6 +3104,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
json res_json = result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
output = format_anthropic_sse(res_json);
} else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
output = format_oai_resp_sse(res_json);
} else {
output = format_oai_sse(res_json);
}
@ -3479,6 +3536,22 @@ void server_routes::init_routes() {
TASK_RESPONSE_TYPE_OAI_CHAT);
};
this->post_responses_oai = [this](const server_http_req & req) {
auto res = create_response();
std::vector<raw_buffer> files;
json body = convert_responses_to_chatcmpl(json::parse(req.body));
json body_parsed = oaicompat_chat_params_parse(
body,
ctx_server.oai_parser_opt,
files);
return handle_completions_impl(
req,
SERVER_TASK_TYPE_COMPLETION,
body_parsed,
files,
TASK_RESPONSE_TYPE_OAI_RESP);
};
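
A minimal non-streaming request against the new /v1/responses route, as a sketch; the base URL and prompt are assumptions, and any OpenAI-compatible client or plain HTTP works.

import requests  # plain-HTTP sketch; base URL is an assumption

resp = requests.post(
    "http://localhost:8080/v1/responses",
    json={"input": "Write a haiku about llamas.", "max_output_tokens": 32},
)
body = resp.json()
print(body["status"], body["usage"]["output_tokens"])
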
this->post_anthropic_messages = [this](const server_http_req & req) {
auto res = create_response();
std::vector<raw_buffer> files;

View File

@ -95,6 +95,7 @@ struct server_routes {
server_http_context::handler_t post_completions;
server_http_context::handler_t post_completions_oai;
server_http_context::handler_t post_chat_completions;
server_http_context::handler_t post_responses_oai;
server_http_context::handler_t post_anthropic_messages;
server_http_context::handler_t post_anthropic_count_tokens;
server_http_context::handler_t post_apply_template;

View File

@ -578,6 +578,8 @@ json server_task_result_cmpl_final::to_json() {
return to_json_oaicompat();
case TASK_RESPONSE_TYPE_OAI_CHAT:
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
case TASK_RESPONSE_TYPE_OAI_RESP:
return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
case TASK_RESPONSE_TYPE_ANTHROPIC:
return stream ? to_json_anthropic_stream() : to_json_anthropic();
default:
@ -795,6 +797,122 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
return deltas;
}
json server_task_result_cmpl_final::to_json_oaicompat_resp() {
common_chat_msg msg;
if (!oaicompat_msg.empty()) {
msg = oaicompat_msg;
} else {
msg.role = "assistant";
msg.content = content;
}
const json reasoning = {
{"type", "reasoning"},
{"summary", json::array({json {
{"type", "summary_text"},
{"text", msg.reasoning_content}
}})}
};
const json message = {
{"type", "message"},
{"status", "completed"},
{"content", json::array({json {
{"type", "output_text"},
{"annotations", json::array()},
{"logprobs", json::array()},
{"text", msg.content}
}})},
{"role", msg.role}
};
std::time_t t = std::time(0);
json res = {
{"object", "response"},
{"created_at", t},
{"status", "completed"},
{"model", oaicompat_model},
{"output", json::array({reasoning, message})},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens}
}},
};
if (verbose) {
res["__verbose"] = to_json_non_oaicompat();
}
if (timings.prompt_n >= 0) {
res.push_back({"timings", timings.to_json()});
}
return res;
}
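
The non-streaming reply assembled above has roughly the following shape (all values illustrative):

# Illustrative shape of the non-streaming /v1/responses reply; values are made up.
example_response = {
    "object": "response",
    "created_at": 1767052800,
    "status": "completed",
    "model": "gpt-4.1",
    "output": [
        {"type": "reasoning", "summary": [{"type": "summary_text", "text": "..."}]},
        {"type": "message", "status": "completed", "role": "assistant",
         "content": [{"type": "output_text", "annotations": [], "logprobs": [], "text": "Hello!"}]},
    ],
    "usage": {"input_tokens": 12, "output_tokens": 8, "total_tokens": 20},
}
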
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
json server_sent_events = json::array();
server_sent_events.push_back(json {
{"event", "response.output_text.done"},
{"data", json {
{"type", "response.output_text.done"},
{"text", oaicompat_msg.content}
}}
});
const json part = {
{"type", "output_text"},
{"annotations", json::array()},
{"logprobs", json::array()},
{"text", oaicompat_msg.content}
};
server_sent_events.push_back(json {
{"event", "response.content_part.done"},
{"data", json {
{"type", "response.content_part.done"},
{"part", part}
}}
});
const json item = {
{"type", "message"},
{"status", "completed"},
{"content", json::array({part})},
{"role", "assistant"}
};
server_sent_events.push_back(json {
{"event", "response.output_item.done"},
{"data", json {
{"type", "response.output_item.done"},
{"item", item}
}}
});
std::time_t t = std::time(0);
server_sent_events.push_back(json {
{"event", "response.completed"},
{"data", json {
{"type", "response.completed"},
{"response", json {
{"object", "response"},
{"created_at", t},
{"status", "completed"},
{"model", oaicompat_model},
{"output", json::array({item})},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens}
}}
}},
}}
});
return server_sent_events;
}
json server_task_result_cmpl_final::to_json_anthropic() {
std::string stop_reason = "max_tokens";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@ -989,6 +1107,8 @@ json server_task_result_cmpl_partial::to_json() {
return to_json_oaicompat();
case TASK_RESPONSE_TYPE_OAI_CHAT:
return to_json_oaicompat_chat();
case TASK_RESPONSE_TYPE_OAI_RESP:
return to_json_oaicompat_resp();
case TASK_RESPONSE_TYPE_ANTHROPIC:
return to_json_anthropic();
default:
@ -1058,6 +1178,33 @@ json server_task_result_cmpl_partial::to_json_oaicompat() {
return res;
}
json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
std::vector<json> deltas;
for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
if (!diff.reasoning_content_delta.empty()) {
deltas.push_back(json {
{"event", "response.reasoning_text.delta"},
{"data", json {
{"type", "response.reasoning_text.delta"},
{"delta", diff.reasoning_content_delta}
}}
});
}
if (!diff.content_delta.empty()) {
deltas.push_back(json {
{"event", "response.output_text.delta"},
{"data", json {
{"type", "response.output_text.delta"},
{"delta", diff.content_delta}
}}
});
}
}
return deltas;
}
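
Putting the streaming pieces together: a client first receives the preamble events (response.created through response.content_part.added), then response.reasoning_text.delta / response.output_text.delta chunks from the partial results above, and finally the done/completed events; no "data: [DONE]" sentinel follows for this response type. A small raw-SSE consumer sketch, with the base URL assumed:

import json
import requests  # raw SSE consumer sketch; base URL is an assumption

with requests.post(
    "http://localhost:8080/v1/responses",
    json={"input": "Hi", "stream": True},
    stream=True,
) as r:
    event = None
    for raw in r.iter_lines(decode_unicode=True):
        if raw.startswith("event: "):
            event = raw[len("event: "):]
        elif raw.startswith("data: "):
            payload = json.loads(raw[len("data: "):])
            if event == "response.output_text.delta":
                print(payload["delta"], end="", flush=True)
            elif event == "response.completed":
                print()  # stream ends here; no [DONE] line is sent
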
json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
bool first = n_decoded == 1;
std::time_t t = std::time(0);

View File

@ -33,6 +33,7 @@ enum task_response_type {
TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
TASK_RESPONSE_TYPE_OAI_CHAT,
TASK_RESPONSE_TYPE_OAI_CMPL,
TASK_RESPONSE_TYPE_OAI_RESP,
TASK_RESPONSE_TYPE_OAI_EMBD,
TASK_RESPONSE_TYPE_ANTHROPIC,
};
@ -311,6 +312,10 @@ struct server_task_result_cmpl_final : server_task_result {
json to_json_oaicompat_chat_stream();
json to_json_oaicompat_resp();
json to_json_oaicompat_resp_stream();
json to_json_anthropic();
json to_json_anthropic_stream();
@ -354,6 +359,8 @@ struct server_task_result_cmpl_partial : server_task_result {
json to_json_oaicompat_chat();
json to_json_oaicompat_resp();
json to_json_anthropic();
};

View File

@ -140,6 +140,7 @@ int main(int argc, char ** argv, char ** envp) {
routes.post_completions = models_routes->proxy_post;
routes.post_completions_oai = models_routes->proxy_post;
routes.post_chat_completions = models_routes->proxy_post;
routes.post_responses_oai = models_routes->proxy_post;
routes.post_anthropic_messages = models_routes->proxy_post;
routes.post_anthropic_count_tokens = models_routes->proxy_post;
routes.post_infill = models_routes->proxy_post;
@ -176,6 +177,7 @@ int main(int argc, char ** argv, char ** envp) {
ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
ctx_http.post("/infill", ex_wrapper(routes.post_infill));

View File

@ -2,7 +2,7 @@ aiohttp~=3.9.3
pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
numpy~=1.26.4
openai~=1.55.3
openai~=2.14.0
prometheus-client~=0.20.0
requests~=2.32.3
wget~=3.2

View File

@ -0,0 +1,48 @@
import pytest
from openai import OpenAI
from utils import *
server: ServerProcess
@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
def test_responses_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.responses.create(
model="gpt-4.1",
input=[
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
max_output_tokens=8,
temperature=0.8,
)
assert match_regex("(Suddenly)+", res.output_text)
def test_responses_stream_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
stream = client.responses.create(
model="gpt-4.1",
input=[
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
max_output_tokens=8,
temperature=0.8,
stream=True,
)
gathered_text = ''
for r in stream:
if r.type == "response.output_text.delta":
gathered_text += r.delta
if r.type == "response.completed":
assert gathered_text == r.response.output_text
assert match_regex("(Suddenly)+", r.response.output_text)