commit 1df28c4053
parent d5574c919c

    from previous PR

@@ -3,7 +3,7 @@ pytest~=8.3.3
 huggingface_hub>=0.34.0,<1.0
 matplotlib~=3.10.0
 numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
 pandas~=2.2.3
 prometheus-client~=0.20.0
 requests~=2.32.3

@@ -1070,6 +1070,48 @@ json oaicompat_chat_params_parse(
     return llama_params;
 }

+json convert_responses_to_chatcmpl(const json & body) {
+    if (!body.contains("input")) {
+        throw std::invalid_argument("'input' is required");
+    }
+    if (!json_value(body, "previous_response_id", std::string{}).empty()) {
+        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
+    }
+
+    const json input_value = body.at("input");
+    json chatcmpl_messages = json::array();
+
+    if (input_value.is_array()) {
+        chatcmpl_messages = input_value;
+    } else if (input_value.is_string()) {
+        chatcmpl_messages.push_back({
+            {"role", "user"},
+            {"content", input_value},
+        });
+    } else {
+        throw std::invalid_argument("'input' must be a string or array of objects");
+    }
+
+    const std::string instructions = json_value(body, "instructions", std::string{});
+    if (instructions != "") {
+        chatcmpl_messages.push_back({
+            {"role", "system"},
+            {"content", instructions},
+        });
+    }
+
+    json chatcmpl_body = body;
+    chatcmpl_body.erase("input");
+    chatcmpl_body["messages"] = chatcmpl_messages;
+
+    if (body.contains("max_output_tokens")) {
+        chatcmpl_body.erase("max_output_tokens");
+        chatcmpl_body["max_tokens"] = body["max_output_tokens"];
+    }
+
+    return chatcmpl_body;
+}
+
 json convert_anthropic_to_oai(const json & body) {
     json oai_body;
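
Illustrative sketch (not part of the diff; field values are made up): a Responses-style body such as

    {"input": "Hello", "instructions": "Answer briefly", "max_output_tokens": 32}

is rewritten by convert_responses_to_chatcmpl into a Chat Completions body along the lines of

    {"messages": [{"role": "user", "content": "Hello"},
                  {"role": "system", "content": "Answer briefly"}],
     "max_tokens": 32}

The "instructions" text is appended as a system message after the input-derived messages, and any other fields in the request body are passed through unchanged.
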
@@ -1478,6 +1520,24 @@ std::string format_oai_sse(const json & data) {
     return ss.str();
 }

+std::string format_oai_resp_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & event_obj) {
+        ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+        ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+    };
+
+    if (data.is_array()) {
+        for (const auto & item : data) {
+            send_single(item);
+        }
+    } else {
+        send_single(data);
+    }
+
+    return ss.str();
+}
+
 std::string format_anthropic_sse(const json & data) {
     std::ostringstream ss;
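
Sketch of the wire format this helper produces (event name and delta text are illustrative): given {"event": "response.output_text.delta", "data": {"type": "response.output_text.delta", "delta": "Hi"}}, it emits

    event: response.output_text.delta
    data: {"type":"response.output_text.delta","delta":"Hi"}

followed by a blank line; a JSON array input yields one such block per item.
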
@@ -295,6 +295,9 @@ json oaicompat_chat_params_parse(
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files);

+// convert OpenAI Responses API format to OpenAI Chat Completions API format
+json convert_responses_to_chatcmpl(const json & body);
+
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
 json convert_anthropic_to_oai(const json & body);

@@ -332,6 +335,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
 // note: if data is a json array, it will be sent as multiple events, one per item
 std::string format_oai_sse(const json & data);

+std::string format_oai_resp_sse(const json & data);
+
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);

@@ -2981,6 +2981,58 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
     json first_result_json = first_result->to_json();
     if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
         res->data = format_anthropic_sse(first_result_json);
+    } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+        const json created = {
+            {"event", "response.created"},
+            {"data", json {
+                {"type", "response.created"},
+                {"response", json {
+                    {"object", "response"},
+                    {"status", "in_progress"}
+                }}
+            }}
+        };
+        const json in_progress = {
+            {"event", "response.in_progress"},
+            {"data", json {
+                {"type", "response.in_progress"},
+                {"response", json {
+                    {"object", "response"},
+                    {"status", "in_progress"}
+                }}
+            }}
+        };
+        const json output_item_added = {
+            {"event", "response.output_item.added"},
+            {"data", json {
+                {"type", "response.output_item.added"},
+                {"item", json {
+                    {"type", "message"},
+                    {"status", "in_progress"},
+                    {"content", json::array()},
+                    {"role", "assistant"}
+                }}
+            }}
+        };
+        const json content_part_added = {
+            {"event", "response.content_part.added"},
+            {"data", json {
+                {"type", "response.content_part.added"},
+                {"part", json {
+                    {"type", "output_text"},
+                    {"text", ""}
+                }}
+            }}
+        };
+
+        const json initial_events = json::array({
+            created,
+            in_progress,
+            output_item_added,
+            content_part_added
+        });
+
+        res->data = format_oai_resp_sse(initial_events) + format_oai_resp_sse(first_result_json);
     } else {
         res->data = format_oai_sse(first_result_json);
     }

@@ -3015,13 +3067,16 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(

     // check if there is more data
     if (!rd.has_next()) {
-        if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
-            // Anthropic doesn't send [DONE], message_stop was already sent
-            output = "";
-        } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
-            output = "data: [DONE]\n\n";
-        } else {
-            output = "";
+        switch (res_type) {
+            case TASK_RESPONSE_TYPE_NONE:
+            case TASK_RESPONSE_TYPE_OAI_RESP:
+            case TASK_RESPONSE_TYPE_ANTHROPIC:
+                output = "";
+                break;
+
+            default:
+                output = "data: [DONE]\n\n";
+                break;
         }
         SRV_DBG("%s", "all results received, terminating stream\n");
         return false; // no more data, terminate

@@ -3049,6 +3104,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
     json res_json = result->to_json();
     if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
         output = format_anthropic_sse(res_json);
+    } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+        output = format_oai_resp_sse(res_json);
     } else {
         output = format_oai_sse(res_json);
     }

@@ -3479,6 +3536,22 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_CHAT);
     };

+    this->post_responses_oai = [this](const server_http_req & req) {
+        auto res = create_response();
+        std::vector<raw_buffer> files;
+        json body = convert_responses_to_chatcmpl(json::parse(req.body));
+        json body_parsed = oaicompat_chat_params_parse(
+            body,
+            ctx_server.oai_parser_opt,
+            files);
+        return handle_completions_impl(
+            req,
+            SERVER_TASK_TYPE_COMPLETION,
+            body_parsed,
+            files,
+            TASK_RESPONSE_TYPE_OAI_RESP);
+    };
+
     this->post_anthropic_messages = [this](const server_http_req & req) {
         auto res = create_response();
         std::vector<raw_buffer> files;

@@ -95,6 +95,7 @@ struct server_routes {
     server_http_context::handler_t post_completions;
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_responses_oai;
     server_http_context::handler_t post_anthropic_messages;
     server_http_context::handler_t post_anthropic_count_tokens;
     server_http_context::handler_t post_apply_template;

@@ -578,6 +578,8 @@ json server_task_result_cmpl_final::to_json() {
             return to_json_oaicompat();
         case TASK_RESPONSE_TYPE_OAI_CHAT:
             return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+        case TASK_RESPONSE_TYPE_OAI_RESP:
+            return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
         case TASK_RESPONSE_TYPE_ANTHROPIC:
             return stream ? to_json_anthropic_stream() : to_json_anthropic();
         default:

@@ -795,6 +797,122 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
     return deltas;
 }

+json server_task_result_cmpl_final::to_json_oaicompat_resp() {
+    common_chat_msg msg;
+    if (!oaicompat_msg.empty()) {
+        msg = oaicompat_msg;
+    } else {
+        msg.role = "assistant";
+        msg.content = content;
+    }
+
+    const json reasoning = {
+        {"type", "reasoning"},
+        {"summary", json::array({json {
+            {"type", "summary_text"},
+            {"text", msg.reasoning_content}
+        }})}
+    };
+    const json message = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({json {
+            {"type", "output_text"},
+            {"annotations", json::array()},
+            {"logprobs", json::array()},
+            {"text", msg.content}
+        }})},
+        {"role", msg.role}
+    };
+
+    std::time_t t = std::time(0);
+    json res = {
+        {"object", "response"},
+        {"created_at", t},
+        {"status", "completed"},
+        {"model", oaicompat_model},
+        {"output", json::array({reasoning, message})},
+        {"usage", json {
+            {"input_tokens", n_prompt_tokens},
+            {"output_tokens", n_decoded},
+            {"total_tokens", n_decoded + n_prompt_tokens}
+        }},
+    };
+
+    if (verbose) {
+        res["__verbose"] = to_json_non_oaicompat();
+    }
+    if (timings.prompt_n >= 0) {
+        res.push_back({"timings", timings.to_json()});
+    }
+
+    return res;
+}
+
+json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
+    json server_sent_events = json::array();
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_text.done"},
+        {"data", json {
+            {"type", "response.output_text.done"},
+            {"text", oaicompat_msg.content}
+        }}
+    });
+
+    const json part = {
+        {"type", "output_text"},
+        {"annotations", json::array()},
+        {"logprobs", json::array()},
+        {"text", oaicompat_msg.content}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.content_part.done"},
+        {"data", json {
+            {"type", "response.content_part.done"},
+            {"part", part}
+        }}
+    });
+
+    const json item = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({part})},
+        {"role", "assistant"}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_item.done"},
+        {"data", json {
+            {"type", "response.output_item.done"},
+            {"item", item}
+        }}
+    });
+
+    std::time_t t = std::time(0);
+    server_sent_events.push_back(json {
+        {"event", "response.completed"},
+        {"data", json {
+            {"type", "response.completed"},
+            {"response", json {
+                {"object", "response"},
+                {"created_at", t},
+                {"status", "completed"},
+                {"model", oaicompat_model},
+                {"output", json::array({item})},
+                {"usage", json {
+                    {"input_tokens", n_prompt_tokens},
+                    {"output_tokens", n_decoded},
+                    {"total_tokens", n_decoded + n_prompt_tokens}
+                }}
+            }},
+        }}
+    });
+
+    return server_sent_events;
+}
+
 json server_task_result_cmpl_final::to_json_anthropic() {
     std::string stop_reason = "max_tokens";
     if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
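
For orientation, a non-streaming result serialized by to_json_oaicompat_resp has roughly this shape (values illustrative; the optional "__verbose" and "timings" fields are omitted):

    {
      "object": "response",
      "created_at": 1700000000,
      "status": "completed",
      "model": "gpt-4.1",
      "output": [
        {"type": "reasoning", "summary": [{"type": "summary_text", "text": "..."}]},
        {"type": "message", "status": "completed", "role": "assistant",
         "content": [{"type": "output_text", "annotations": [], "logprobs": [], "text": "Hello!"}]}
      ],
      "usage": {"input_tokens": 10, "output_tokens": 3, "total_tokens": 13}
    }
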
@@ -989,6 +1107,8 @@ json server_task_result_cmpl_partial::to_json() {
             return to_json_oaicompat();
         case TASK_RESPONSE_TYPE_OAI_CHAT:
             return to_json_oaicompat_chat();
+        case TASK_RESPONSE_TYPE_OAI_RESP:
+            return to_json_oaicompat_resp();
         case TASK_RESPONSE_TYPE_ANTHROPIC:
            return to_json_anthropic();
         default:

@@ -1058,6 +1178,33 @@ json server_task_result_cmpl_partial::to_json_oaicompat() {
     return res;
 }

+json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
+    std::vector<json> deltas;
+
+    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
+        if (!diff.reasoning_content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.reasoning_text.delta"},
+                {"data", json {
+                    {"type", "response.reasoning_text.delta"},
+                    {"delta", diff.reasoning_content_delta}
+                }}
+            });
+        }
+        if (!diff.content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.output_text.delta"},
+                {"data", json {
+                    {"type", "response.output_text.delta"},
+                    {"delta", diff.content_delta}
+                }}
+            });
+        }
+    }
+
+    return deltas;
+}
+
 json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
     bool first = n_decoded == 1;
     std::time_t t = std::time(0);

@@ -33,6 +33,7 @@ enum task_response_type {
     TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
     TASK_RESPONSE_TYPE_OAI_CHAT,
     TASK_RESPONSE_TYPE_OAI_CMPL,
+    TASK_RESPONSE_TYPE_OAI_RESP,
     TASK_RESPONSE_TYPE_OAI_EMBD,
     TASK_RESPONSE_TYPE_ANTHROPIC,
 };

@@ -311,6 +312,10 @@ struct server_task_result_cmpl_final : server_task_result {

     json to_json_oaicompat_chat_stream();

+    json to_json_oaicompat_resp();
+
+    json to_json_oaicompat_resp_stream();
+
     json to_json_anthropic();

     json to_json_anthropic_stream();

@@ -354,6 +359,8 @@ struct server_task_result_cmpl_partial : server_task_result {

     json to_json_oaicompat_chat();

+    json to_json_oaicompat_resp();
+
     json to_json_anthropic();
 };

@@ -140,6 +140,7 @@ int main(int argc, char ** argv, char ** envp) {
     routes.post_completions = models_routes->proxy_post;
     routes.post_completions_oai = models_routes->proxy_post;
     routes.post_chat_completions = models_routes->proxy_post;
+    routes.post_responses_oai = models_routes->proxy_post;
     routes.post_anthropic_messages = models_routes->proxy_post;
     routes.post_anthropic_count_tokens = models_routes->proxy_post;
     routes.post_infill = models_routes->proxy_post;

@@ -176,6 +177,7 @@ int main(int argc, char ** argv, char ** envp) {
     ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
     ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
     ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
+    ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
     ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
     ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
     ctx_http.post("/infill", ex_wrapper(routes.post_infill));

@@ -2,7 +2,7 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub>=0.34.0,<1.0
 numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
 prometheus-client~=0.20.0
 requests~=2.32.3
 wget~=3.2

@@ -0,0 +1,48 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server: ServerProcess
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+def test_responses_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.responses.create(
+        model="gpt-4.1",
+        input=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_output_tokens=8,
+        temperature=0.8,
+    )
+    assert match_regex("(Suddenly)+", res.output_text)
+
+def test_responses_stream_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    stream = client.responses.create(
+        model="gpt-4.1",
+        input=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_output_tokens=8,
+        temperature=0.8,
+        stream=True,
+    )
+
+    gathered_text = ''
+    for r in stream:
+        if r.type == "response.output_text.delta":
+            gathered_text += r.delta
+        if r.type == "response.completed":
+            assert gathered_text == r.response.output_text
+            assert match_regex("(Suddenly)+", r.response.output_text)