commit 1df28c4053
parent d5574c919c

    from previous PR
@@ -3,7 +3,7 @@ pytest~=8.3.3
 huggingface_hub>=0.34.0,<1.0
 matplotlib~=3.10.0
 numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
 pandas~=2.2.3
 prometheus-client~=0.20.0
 requests~=2.32.3
@@ -1070,6 +1070,48 @@ json oaicompat_chat_params_parse(
     return llama_params;
 }
 
+json convert_responses_to_chatcmpl(const json & body) {
+    if (!body.contains("input")) {
+        throw std::invalid_argument("'input' is required");
+    }
+    if (!json_value(body, "previous_response_id", std::string{}).empty()) {
+        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
+    }
+
+    const json input_value = body.at("input");
+    json chatcmpl_messages = json::array();
+
+    if (input_value.is_array()) {
+        chatcmpl_messages = input_value;
+    } else if (input_value.is_string()) {
+        chatcmpl_messages.push_back({
+            {"role", "user"},
+            {"content", input_value},
+        });
+    } else {
+        throw std::invalid_argument("'input' must be a string or array of objects");
+    }
+
+    const std::string instructions = json_value(body, "instructions", std::string{});
+    if (instructions != "") {
+        chatcmpl_messages.push_back({
+            {"role", "system"},
+            {"content", instructions},
+        });
+    }
+
+    json chatcmpl_body = body;
+    chatcmpl_body.erase("input");
+    chatcmpl_body["messages"] = chatcmpl_messages;
+
+    if (body.contains("max_output_tokens")) {
+        chatcmpl_body.erase("max_output_tokens");
+        chatcmpl_body["max_tokens"] = body["max_output_tokens"];
+    }
+
+    return chatcmpl_body;
+}
+
 json convert_anthropic_to_oai(const json & body) {
     json oai_body;
 
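
The function above only re-shapes the Responses API request body into Chat Completions form: a string `input` becomes one user message, an `input` array is taken as the messages list, `instructions` is appended as a system message, and `max_output_tokens` is renamed to `max_tokens`. As a rough standalone illustration (editor's sketch, not part of this commit), the snippet below performs the same mapping inline with nlohmann::json, the JSON library llama.cpp already uses; values are placeholders and error handling is omitted.

    // Standalone sketch: mirrors the mapping of convert_responses_to_chatcmpl()
    // without depending on the server sources.
    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::json;

    int main() {
        json body = {
            {"model", "gpt-4.1"},
            {"input", "What is the best book"},   // plain string -> one user message
            {"instructions", "Answer briefly."},  // appended as a system message
            {"max_output_tokens", 8},
        };

        json messages = json::array();
        if (body["input"].is_string()) {
            messages.push_back({{"role", "user"}, {"content", body["input"]}});
        } else {
            messages = body["input"];             // already a list of message objects
        }
        if (body.contains("instructions")) {
            messages.push_back({{"role", "system"}, {"content", body["instructions"]}});
        }
        body.erase("input");
        body["messages"] = messages;
        if (body.contains("max_output_tokens")) {
            body["max_tokens"] = body["max_output_tokens"];
            body.erase("max_output_tokens");
        }

        std::cout << body.dump(2) << std::endl;   // chat-completions style body
    }
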
@@ -1478,6 +1520,24 @@ std::string format_oai_sse(const json & data) {
     return ss.str();
 }
 
+std::string format_oai_resp_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & event_obj) {
+        ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+        ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+    };
+
+    if (data.is_array()) {
+        for (const auto & item : data) {
+            send_single(item);
+        }
+    } else {
+        send_single(data);
+    }
+
+    return ss.str();
+}
+
 std::string format_anthropic_sse(const json & data) {
     std::ostringstream ss;
 
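
Unlike format_oai_sse, which emits bare `data:` lines, format_oai_resp_sse prefixes every payload with an `event:` line carrying the Responses API event type, and accepts either a single `{"event", "data"}` object or an array of them. A minimal standalone sketch of that framing follows (editor's illustration; it uses json::dump() where the server uses safe_json_to_str).

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <sstream>

    using json = nlohmann::json;

    int main() {
        // one event object in the shape the server builds
        json event_obj = {
            {"event", "response.output_text.delta"},
            {"data", {{"type", "response.output_text.delta"}, {"delta", "Hello"}}},
        };

        std::ostringstream ss;
        ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
        ss << "data: "  << event_obj.at("data").dump() << "\n\n";

        // prints:
        //   event: response.output_text.delta
        //   data: {"delta":"Hello","type":"response.output_text.delta"}
        std::cout << ss.str();
    }
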
@@ -295,6 +295,9 @@ json oaicompat_chat_params_parse(
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files);
 
+// convert OpenAI Responses API format to OpenAI Chat Completions API format
+json convert_responses_to_chatcmpl(const json & body);
+
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
 json convert_anthropic_to_oai(const json & body);
 
@@ -332,6 +335,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
 // note: if data is a json array, it will be sent as multiple events, one per item
 std::string format_oai_sse(const json & data);
 
+std::string format_oai_resp_sse(const json & data);
+
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);
 
@@ -2981,6 +2981,58 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
         json first_result_json = first_result->to_json();
         if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
             res->data = format_anthropic_sse(first_result_json);
+        } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+            const json created = {
+                {"event", "response.created"},
+                {"data", json {
+                    {"type", "response.created"},
+                    {"response", json {
+                        {"object", "response"},
+                        {"status", "in_progress"}
+                    }}
+                }}
+            };
+            const json in_progress = {
+                {"event", "response.in_progress"},
+                {"data", json {
+                    {"type", "response.in_progress"},
+                    {"response", json {
+                        {"object", "response"},
+                        {"status", "in_progress"}
+                    }}
+                }}
+            };
+            const json output_item_added = {
+                {"event", "response.output_item.added"},
+                {"data", json {
+                    {"type", "response.output_item.added"},
+                    {"item", json {
+                        {"type", "message"},
+                        {"status", "in_progress"},
+                        {"content", json::array()},
+                        {"role", "assistant"}
+                    }}
+                }}
+            };
+            const json content_part_added = {
+                {"event", "response.content_part.added"},
+                {"data", json {
+                    {"type", "response.content_part.added"},
+                    {"part", json {
+                        {"type", "output_text"},
+                        {"text", ""}
+                    }}
+                }}
+            };
+
+            const json initial_events = json::array({
+                created,
+                in_progress,
+                output_item_added,
+                content_part_added
+            });
+
+            res->data = format_oai_resp_sse(initial_events) + format_oai_resp_sse(first_result_json);
         } else {
             res->data = format_oai_sse(first_result_json);
         }
@@ -3015,13 +3067,16 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
 
             // check if there is more data
             if (!rd.has_next()) {
-                if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
-                    // Anthropic doesn't send [DONE], message_stop was already sent
-                    output = "";
-                } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
-                    output = "data: [DONE]\n\n";
-                } else {
-                    output = "";
-                }
+                switch (res_type) {
+                    case TASK_RESPONSE_TYPE_NONE:
+                    case TASK_RESPONSE_TYPE_OAI_RESP:
+                    case TASK_RESPONSE_TYPE_ANTHROPIC:
+                        output = "";
+                        break;
+
+                    default:
+                        output = "data: [DONE]\n\n";
+                        break;
+                }
                 SRV_DBG("%s", "all results received, terminating stream\n");
                 return false; // no more data, terminate
@@ -3049,6 +3104,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
             json res_json = result->to_json();
             if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
                 output = format_anthropic_sse(res_json);
+            } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+                output = format_oai_resp_sse(res_json);
             } else {
                 output = format_oai_sse(res_json);
             }
@@ -3479,6 +3536,22 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_CHAT);
     };
 
+    this->post_responses_oai = [this](const server_http_req & req) {
+        auto res = create_response();
+        std::vector<raw_buffer> files;
+        json body = convert_responses_to_chatcmpl(json::parse(req.body));
+        json body_parsed = oaicompat_chat_params_parse(
+            body,
+            ctx_server.oai_parser_opt,
+            files);
+        return handle_completions_impl(
+            req,
+            SERVER_TASK_TYPE_COMPLETION,
+            body_parsed,
+            files,
+            TASK_RESPONSE_TYPE_OAI_RESP);
+    };
+
     this->post_anthropic_messages = [this](const server_http_req & req) {
         auto res = create_response();
         std::vector<raw_buffer> files;
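
Taken together with the result-serialization changes further down in this diff, a streaming request to the new /v1/responses route produces a fixed sequence of typed events, and, per the switch above, no trailing `data: [DONE]` terminator is sent for this response type. The listing below is an editor's summary of the emission order, not code from the commit; the reasoning delta only appears when the model emits reasoning content.

    #include <array>
    #include <string_view>

    // emission order for one streamed /v1/responses request, per this diff:
    // the first four events come from handle_completions_impl(), the deltas
    // from the partial results, and the last four from the final result.
    constexpr std::array<std::string_view, 10> responses_stream_event_order = {
        "response.created",
        "response.in_progress",
        "response.output_item.added",
        "response.content_part.added",
        "response.reasoning_text.delta",  // zero or more, interleaved with text deltas
        "response.output_text.delta",     // zero or more
        "response.output_text.done",
        "response.content_part.done",
        "response.output_item.done",
        "response.completed",
    };
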
@@ -95,6 +95,7 @@ struct server_routes {
     server_http_context::handler_t post_completions;
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_responses_oai;
     server_http_context::handler_t post_anthropic_messages;
     server_http_context::handler_t post_anthropic_count_tokens;
     server_http_context::handler_t post_apply_template;
@@ -578,6 +578,8 @@ json server_task_result_cmpl_final::to_json() {
             return to_json_oaicompat();
         case TASK_RESPONSE_TYPE_OAI_CHAT:
             return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+        case TASK_RESPONSE_TYPE_OAI_RESP:
+            return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
         case TASK_RESPONSE_TYPE_ANTHROPIC:
             return stream ? to_json_anthropic_stream() : to_json_anthropic();
         default:
@@ -795,6 +797,122 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
     return deltas;
 }
 
+json server_task_result_cmpl_final::to_json_oaicompat_resp() {
+    common_chat_msg msg;
+    if (!oaicompat_msg.empty()) {
+        msg = oaicompat_msg;
+    } else {
+        msg.role = "assistant";
+        msg.content = content;
+    }
+
+    const json reasoning = {
+        {"type", "reasoning"},
+        {"summary", json::array({json {
+            {"type", "summary_text"},
+            {"text", msg.reasoning_content}
+        }})}
+    };
+    const json message = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({json {
+            {"type", "output_text"},
+            {"annotations", json::array()},
+            {"logprobs", json::array()},
+            {"text", msg.content}
+        }})},
+        {"role", msg.role}
+    };
+
+    std::time_t t = std::time(0);
+    json res = {
+        {"object", "response"},
+        {"created_at", t},
+        {"status", "completed"},
+        {"model", oaicompat_model},
+        {"output", json::array({reasoning, message})},
+        {"usage", json {
+            {"input_tokens", n_prompt_tokens},
+            {"output_tokens", n_decoded},
+            {"total_tokens", n_decoded + n_prompt_tokens}
+        }},
+    };
+
+    if (verbose) {
+        res["__verbose"] = to_json_non_oaicompat();
+    }
+    if (timings.prompt_n >= 0) {
+        res.push_back({"timings", timings.to_json()});
+    }
+
+    return res;
+}
+
+json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
+    json server_sent_events = json::array();
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_text.done"},
+        {"data", json {
+            {"type", "response.output_text.done"},
+            {"text", oaicompat_msg.content}
+        }}
+    });
+
+    const json part = {
+        {"type", "output_text"},
+        {"annotations", json::array()},
+        {"logprobs", json::array()},
+        {"text", oaicompat_msg.content}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.content_part.done"},
+        {"data", json {
+            {"type", "response.content_part.done"},
+            {"part", part}
+        }}
+    });
+
+    const json item = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({part})},
+        {"role", "assistant"}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_item.done"},
+        {"data", json {
+            {"type", "response.output_item.done"},
+            {"item", item}
+        }}
+    });
+
+    std::time_t t = std::time(0);
+    server_sent_events.push_back(json {
+        {"event", "response.completed"},
+        {"data", json {
+            {"type", "response.completed"},
+            {"response", json {
+                {"object", "response"},
+                {"created_at", t},
+                {"status", "completed"},
+                {"model", oaicompat_model},
+                {"output", json::array({item})},
+                {"usage", json {
+                    {"input_tokens", n_prompt_tokens},
+                    {"output_tokens", n_decoded},
+                    {"total_tokens", n_decoded + n_prompt_tokens}
+                }}
+            }},
+        }}
+    });
+
+    return server_sent_events;
+}
+
 json server_task_result_cmpl_final::to_json_anthropic() {
     std::string stop_reason = "max_tokens";
     if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
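
For reference, the non-streaming /v1/responses result assembled by to_json_oaicompat_resp() above has the overall shape sketched below. This is an editor's standalone illustration with placeholder values; the field names are taken from the code above.

    #include <nlohmann/json.hpp>
    #include <ctime>
    #include <iostream>

    using json = nlohmann::json;

    int main() {
        json text_part = {
            {"type", "output_text"},
            {"annotations", json::array()},
            {"logprobs", json::array()},
            {"text", "Hello there."},
        };
        json message = {
            {"type", "message"},
            {"status", "completed"},
            {"role", "assistant"},
            {"content", json::array({text_part})},
        };
        json reasoning = {
            {"type", "reasoning"},
            {"summary", json::array({json {{"type", "summary_text"}, {"text", ""}}})},
        };
        json res = {
            {"object", "response"},
            {"created_at", std::time(0)},
            {"status", "completed"},
            {"model", "placeholder-model"},
            {"output", json::array({reasoning, message})},
            {"usage", json {{"input_tokens", 4}, {"output_tokens", 3}, {"total_tokens", 7}}},
        };
        std::cout << res.dump(2) << std::endl;
    }
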
@@ -989,6 +1107,8 @@ json server_task_result_cmpl_partial::to_json() {
             return to_json_oaicompat();
         case TASK_RESPONSE_TYPE_OAI_CHAT:
             return to_json_oaicompat_chat();
+        case TASK_RESPONSE_TYPE_OAI_RESP:
+            return to_json_oaicompat_resp();
         case TASK_RESPONSE_TYPE_ANTHROPIC:
            return to_json_anthropic();
         default:
@@ -1058,6 +1178,33 @@ json server_task_result_cmpl_partial::to_json_oaicompat() {
     return res;
 }
 
+json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
+    std::vector<json> deltas;
+
+    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
+        if (!diff.reasoning_content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.reasoning_text.delta"},
+                {"data", json {
+                    {"type", "response.reasoning_text.delta"},
+                    {"delta", diff.reasoning_content_delta}
+                }}
+            });
+        }
+        if (!diff.content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.output_text.delta"},
+                {"data", json {
+                    {"type", "response.output_text.delta"},
+                    {"delta", diff.content_delta}
+                }}
+            });
+        }
+    }
+
+    return deltas;
+}
+
 json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
     bool first = n_decoded == 1;
     std::time_t t = std::time(0);
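
Each partial result can thus yield up to two events per message diff: one for reasoning text and one for visible text. A small standalone sketch of that mapping follows (editor's illustration; the diff_stub struct is a stand-in for the fields of common_chat_msg_diff used above, not the real type).

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    using json = nlohmann::json;

    struct diff_stub {                        // stand-in, not the real struct
        std::string reasoning_content_delta;
        std::string content_delta;
    };

    int main() {
        diff_stub diff = {"let me think", "Hello"};
        std::vector<json> deltas;

        if (!diff.reasoning_content_delta.empty()) {
            deltas.push_back({
                {"event", "response.reasoning_text.delta"},
                {"data", {{"type", "response.reasoning_text.delta"},
                          {"delta", diff.reasoning_content_delta}}},
            });
        }
        if (!diff.content_delta.empty()) {
            deltas.push_back({
                {"event", "response.output_text.delta"},
                {"data", {{"type", "response.output_text.delta"},
                          {"delta", diff.content_delta}}},
            });
        }

        for (const json & d : deltas) {
            std::cout << d.dump() << "\n";    // two events for this diff
        }
    }
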
@@ -33,6 +33,7 @@ enum task_response_type {
     TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
     TASK_RESPONSE_TYPE_OAI_CHAT,
     TASK_RESPONSE_TYPE_OAI_CMPL,
+    TASK_RESPONSE_TYPE_OAI_RESP,
     TASK_RESPONSE_TYPE_OAI_EMBD,
     TASK_RESPONSE_TYPE_ANTHROPIC,
 };
@@ -311,6 +312,10 @@ struct server_task_result_cmpl_final : server_task_result {
 
     json to_json_oaicompat_chat_stream();
 
+    json to_json_oaicompat_resp();
+
+    json to_json_oaicompat_resp_stream();
+
     json to_json_anthropic();
 
     json to_json_anthropic_stream();
@@ -354,6 +359,8 @@ struct server_task_result_cmpl_partial : server_task_result {
 
     json to_json_oaicompat_chat();
 
+    json to_json_oaicompat_resp();
+
     json to_json_anthropic();
 };
 
@@ -140,6 +140,7 @@ int main(int argc, char ** argv, char ** envp) {
     routes.post_completions = models_routes->proxy_post;
     routes.post_completions_oai = models_routes->proxy_post;
     routes.post_chat_completions = models_routes->proxy_post;
+    routes.post_responses_oai = models_routes->proxy_post;
     routes.post_anthropic_messages = models_routes->proxy_post;
     routes.post_anthropic_count_tokens = models_routes->proxy_post;
     routes.post_infill = models_routes->proxy_post;
@@ -176,6 +177,7 @@ int main(int argc, char ** argv, char ** envp) {
     ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
     ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
     ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
+    ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
     ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
     ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
     ctx_http.post("/infill", ex_wrapper(routes.post_infill));
@@ -2,7 +2,7 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub>=0.34.0,<1.0
 numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
 prometheus-client~=0.20.0
 requests~=2.32.3
 wget~=3.2
@@ -0,0 +1,48 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server: ServerProcess
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+def test_responses_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.responses.create(
+        model="gpt-4.1",
+        input=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_output_tokens=8,
+        temperature=0.8,
+    )
+    assert match_regex("(Suddenly)+", res.output_text)
+
+def test_responses_stream_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    stream = client.responses.create(
+        model="gpt-4.1",
+        input=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_output_tokens=8,
+        temperature=0.8,
+        stream=True,
+    )
+
+    gathered_text = ''
+    for r in stream:
+        if r.type == "response.output_text.delta":
+            gathered_text += r.delta
+        if r.type == "response.completed":
+            assert gathered_text == r.response.output_text
+            assert match_regex("(Suddenly)+", r.response.output_text)