server: /v1/responses (partial) (#18486)
* from previous PR
* Make instruction(system) as first message
* Convert [input_message] (text/image/file)
* Rename convert_responses_to_chatcmpl(body) -> response_body
* Initial tool call support
* Erase instructions field from chatcmpl body
* Feed reasoning texts to chat template
* Use std::vector instead of opaque json array
* Make output_item.added events consistent
* Move `server_task_result_cmpl_partial::update` from header to source
* Match ID of output_item.added and .done events
* Add function_call only if there is no "fc_" prefix
* Add function call output at non-streaming API
* Test if ID is persistent
* Add doc
* Fix style - use trailing comma
* Rewrite state management
* catch up with upstream/master
* Fix style - "type" is the first item of SSE data
* Explicitly check "instructions" from response_body
* Make lambdas static
* Check if reasoning content exists
* Add `oai_resp_id` to task_result_state (also initialized at ctor), server_task_result_cmpl_partial, and server_task_result_cmpl_final
* Reject `input_file` since it is not supported by chatcmpl
* Add "fc_" prefix to non-streaming function call id as coderabbit pointed out

---------

Co-authored-by: openingnow <>
Parent: 33f890e579 · Commit: fbbf3ad190
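The core of the change is a request-body translation: a Responses API payload is rewritten into a Chat Completions payload before it reaches the existing completion path. The sketch below is illustrative only (not code from this PR); the field handling follows `convert_responses_to_chatcmpl` in the C++ diff further down.

```python
# Illustration only: the shape of the translation performed by
# convert_responses_to_chatcmpl (see the C++ diff below).
responses_request = {
    "model": "gpt-4.1",
    "instructions": "You are a helpful assistant.",           # becomes the first (system) message
    "input": "Write a limerick about python exceptions",      # a string input becomes a user message
    "max_output_tokens": 128,                                  # renamed to max_tokens
}

chatcmpl_request = {
    "model": "gpt-4.1",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a limerick about python exceptions"},
    ],
    "max_tokens": 128,
}
```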
@@ -3,7 +3,7 @@ pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
matplotlib~=3.10.0
numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
pandas~=2.2.3
prometheus-client~=0.20.0
requests~=2.32.3
@@ -6,7 +6,7 @@ Set of LLM REST APIs and a web UI to interact with llama.cpp.

**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
-* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
* [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) compatible chat completions
* Reranking endpoint (https://github.com/ggml-org/llama.cpp/pull/9510)
* Parallel decoding with multi-user support
@@ -1267,6 +1267,49 @@ This provides information on the performance of the server. It also allows calcu
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`

+### POST `/v1/responses`: OpenAI-compatible Responses API
+
+*Options:*
+
+See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).
+
+*Examples:*
+
+You can use either Python `openai` library with appropriate checkpoints:
+
+```python
+import openai
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+    api_key = "sk-no-key-required"
+)
+
+response = client.responses.create(
+    model="gpt-4.1",
+    instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+    input="Write a limerick about python exceptions"
+)
+
+print(response.output_text)
+```
+
+... or raw HTTP requests:
+
+```shell
+curl http://localhost:8080/v1/responses \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer no-key" \
+-d '{
+    "model": "gpt-4.1",
+    "instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+    "input": "Write a limerick about python exceptions"
+}'
+```
+
+This endpoint works by converting Responses request into Chat Completions request.
+
### POST `/v1/embeddings`: OpenAI-compatible embeddings API

This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
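Streaming is also supported on the new route. Below is a minimal client sketch; it is not part of the PR, and the event names are taken from the server-sent events emitted by the handler changes and the tests added later in this diff, assuming the same local server as the README example above.

```python
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

stream = client.responses.create(
    model="gpt-4.1",
    input="Write a limerick about python exceptions",
    stream=True,
)

for event in stream:
    if event.type == "response.output_text.delta":
        print(event.delta, end="", flush=True)   # incremental text
    elif event.type == "response.completed":
        print()  # full text is also available as event.response.output_text
```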
@@ -1069,6 +1069,283 @@ json oaicompat_chat_params_parse(
    return llama_params;
}

+json convert_responses_to_chatcmpl(const json & response_body) {
+    if (!response_body.contains("input")) {
+        throw std::invalid_argument("'input' is required");
+    }
+    if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
+        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
+    }
+
+    const json input_value = response_body.at("input");
+    json chatcmpl_body = response_body;
+    chatcmpl_body.erase("input");
+    std::vector<json> chatcmpl_messages;
+
+    if (response_body.contains("instructions")) {
+        chatcmpl_messages.push_back({
+            {"role", "system"},
+            {"content", json_value(response_body, "instructions", std::string())},
+        });
+        chatcmpl_body.erase("instructions");
+    }
+
+    if (input_value.is_string()) {
+        // #responses_create-input-text_input
+        chatcmpl_messages.push_back({
+            {"role", "user"},
+            {"content", input_value},
+        });
+    } else if (input_value.is_array()) {
+        // #responses_create-input-input_item_list
+
+        static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
+            return j.contains(key) && j.at(key).is_array();
+        };
+        static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
+            return j.contains(key) && j.at(key).is_string();
+        };
+
+        for (json item : input_value) {
+            if (exists_and_is_string(item, "content")) {
+                // #responses_create-input-input_item_list-input_message-content-text_input
+                // Only "Input message" contains item["content"]::string
+                // After converting item["content"]::string to item["content"]::array,
+                // we can treat "Input message" as sum of "Item-Input message" and "Item-Output message"
+                item["content"] = json::array({
+                    json {
+                        {"text", item.at("content")},
+                        {"type", "input_text"}
+                    }
+                });
+            }
+
+            if (exists_and_is_array(item, "content") &&
+                exists_and_is_string(item, "role") &&
+                (item.at("role") == "user" ||
+                 item.at("role") == "system" ||
+                 item.at("role") == "developer")
+            ) {
+                // #responses_create-input-input_item_list-item-input_message
+                std::vector<json> chatcmpl_content;
+
+                for (const json & input_item : item.at("content")) {
+                    const std::string type = json_value(input_item, "type", std::string());
+
+                    if (type == "input_text") {
+                        if (!input_item.contains("text")) {
+                            throw std::invalid_argument("'Input text' requires 'text'");
+                        }
+                        chatcmpl_content.push_back({
+                            {"text", input_item.at("text")},
+                            {"type", "text"},
+                        });
+                    } else if (type == "input_image") {
+                        // While `detail` is marked as required,
+                        // it has default value("auto") and can be omitted.
+
+                        if (!input_item.contains("image_url")) {
+                            throw std::invalid_argument("'image_url' is required");
+                        }
+                        chatcmpl_content.push_back({
+                            {"image_url", json {
+                                {"url", input_item.at("image_url")}
+                            }},
+                            {"type", "image_url"},
+                        });
+                    } else if (type == "input_file") {
+                        throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment");
+                        // if (input_item.contains("file_url")) {
+                        //     // chat completion API does not support file_url
+                        //     throw std::invalid_argument("'file_url' is not supported");
+                        // }
+                        // if (!input_item.contains("file_data") || !input_item.contains("filename")) {
+                        //     throw std::invalid_argument("Both 'file_data' and 'filename' are required");
+                        // }
+                        // chatcmpl_content.push_back({
+                        //     {"file", json {
+                        //         {"file_data", input_item.at("file_data")},
+                        //         {"filename", input_item.at("filename")},
+                        //     }},
+                        //     {"type", "file"},
+                        // });
+                    } else {
+                        throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
+                    }
+                }
+
+                if (item.contains("type")) {
+                    item.erase("type");
+                }
+                if (item.contains("status")) {
+                    item.erase("status");
+                }
+                item["content"] = chatcmpl_content;
+
+                chatcmpl_messages.push_back(item);
+            } else if (exists_and_is_array(item, "content") &&
+                       exists_and_is_string(item, "role") &&
+                       item.at("role") == "assistant" &&
+                       // exists_and_is_string(item, "status") &&
+                       // (item.at("status") == "in_progress" ||
+                       //  item.at("status") == "completed" ||
+                       //  item.at("status") == "incomplete") &&
+                       // item["status"] not sent by codex-cli
+                       exists_and_is_string(item, "type") &&
+                       item.at("type") == "message"
+            ) {
+                // #responses_create-input-input_item_list-item-output_message
+                std::vector<json> chatcmpl_content;
+
+                for (const auto & output_text : item.at("content")) {
+                    const std::string type = json_value(output_text, "type", std::string());
+                    if (type != "output_text") {
+                        throw std::invalid_argument("'type' must be 'output_text'");
+                    }
+                    if (!exists_and_is_string(output_text, "text")) {
+                        throw std::invalid_argument("'Output text' requires 'text'");
+                    }
+                    // Ignore annotations and logprobs for now
+                    chatcmpl_content.push_back({
+                        {"text", output_text.at("text")},
+                        {"type", "text"},
+                    });
+                }
+
+                item.erase("status");
+                item.erase("type");
+                item["content"] = chatcmpl_content;
+                chatcmpl_messages.push_back(item);
+            } else if (exists_and_is_string(item, "arguments") &&
+                       exists_and_is_string(item, "call_id") &&
+                       exists_and_is_string(item, "name") &&
+                       exists_and_is_string(item, "type") &&
+                       item.at("type") == "function_call"
+            ) {
+                // #responses_create-input-input_item_list-item-function_tool_call
+                json msg = json {
+                    {"role", "assistant"},
+                    {"tool_calls", json::array({ json {
+                        {"function", json {
+                            {"arguments", item.at("arguments")},
+                            {"name", item.at("name")},
+                        }},
+                        {"id", item.at("call_id")},
+                        {"type", "function"},
+                    }})},
+                };
+
+                if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
+                    // Move reasoning content from dummy message to tool call message
+                    msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
+                    chatcmpl_messages.pop_back();
+                }
+                chatcmpl_messages.push_back(msg);
+            } else if (exists_and_is_string(item, "call_id") &&
+                       (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
+                       exists_and_is_string(item, "type") &&
+                       item.at("type") == "function_call_output"
+            ) {
+                // #responses_create-input-input_item_list-item-function_tool_call_output
+                if (item.at("output").is_string()) {
+                    chatcmpl_messages.push_back(json {
+                        {"content", item.at("output")},
+                        {"role", "tool"},
+                        {"tool_call_id", item.at("call_id")},
+                    });
+                } else {
+                    json chatcmpl_outputs = item.at("output");
+                    for (json & chatcmpl_output : chatcmpl_outputs) {
+                        if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
+                            throw std::invalid_argument("Output of tool call should be 'Input text'");
+                        }
+                        chatcmpl_output["type"] = "text";
+                    }
+                    chatcmpl_messages.push_back(json {
+                        {"content", chatcmpl_outputs},
+                        {"role", "tool"},
+                        {"tool_call_id", item.at("call_id")},
+                    });
+                }
+            } else if (// exists_and_is_string(item, "id") &&
+                       // item["id"] not sent by codex-cli
+                       exists_and_is_array(item, "summary") &&
+                       exists_and_is_string(item, "type") &&
+                       item.at("type") == "reasoning") {
+                // #responses_create-input-input_item_list-item-reasoning
+
+                if (!exists_and_is_array(item, "content")) {
+                    throw std::invalid_argument("item['content'] is not an array");
+                }
+                if (item.at("content").empty()) {
+                    throw std::invalid_argument("item['content'] is empty");
+                }
+                if (!exists_and_is_string(item.at("content")[0], "text")) {
+                    throw std::invalid_argument("item['content']['text'] is not a string");
+                }
+
+                // Pack reasoning content in dummy message
+                chatcmpl_messages.push_back(json {
+                    {"role", "assistant"},
+                    {"content", json::array()},
+                    {"reasoning_content", item.at("content")[0].at("text")},
+                });
+            } else {
+                throw std::invalid_argument("Cannot determine type of 'item'");
+            }
+        }
+    } else {
+        throw std::invalid_argument("'input' must be a string or array of objects");
+    }
+
+    // Remove unused dummy message which contains
+    // reasoning content not followed by tool call
+    chatcmpl_messages.erase(std::remove_if(
+        chatcmpl_messages.begin(),
+        chatcmpl_messages.end(),
+        [](const json & x){ return x.contains("role") &&
+                                   x.at("role") == "assistant" &&
+                                   x.contains("content") &&
+                                   x.at("content") == json::array() &&
+                                   x.contains("reasoning_content");
+        }),
+        chatcmpl_messages.end()
+    );
+
+    chatcmpl_body["messages"] = chatcmpl_messages;
+
+    if (response_body.contains("tools")) {
+        if (!response_body.at("tools").is_array()) {
+            throw std::invalid_argument("'tools' must be an array of objects");
+        }
+        std::vector<json> chatcmpl_tools;
+        for (json resp_tool : response_body.at("tools")) {
+            json chatcmpl_tool;
+
+            if (json_value(resp_tool, "type", std::string()) != "function") {
+                throw std::invalid_argument("'type' of tool must be 'function'");
+            }
+            resp_tool.erase("type");
+            chatcmpl_tool["type"] = "function";
+
+            if (!resp_tool.contains("strict")) {
+                resp_tool["strict"] = true;
+            }
+            chatcmpl_tool["function"] = resp_tool;
+            chatcmpl_tools.push_back(chatcmpl_tool);
+        }
+        chatcmpl_body.erase("tools");
+        chatcmpl_body["tools"] = chatcmpl_tools;
+    }
+
+    if (response_body.contains("max_output_tokens")) {
+        chatcmpl_body.erase("max_output_tokens");
+        chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
+    }
+
+    return chatcmpl_body;
+}
+
json convert_anthropic_to_oai(const json & body) {
    json oai_body;
@@ -1482,6 +1759,24 @@ std::string format_oai_sse(const json & data) {
    return ss.str();
}

+std::string format_oai_resp_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & event_obj) {
+        ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+        ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+    };
+
+    if (data.is_array()) {
+        for (const auto & item : data) {
+            send_single(item);
+        }
+    } else {
+        send_single(data);
+    }
+
+    return ss.str();
+}
+
std::string format_anthropic_sse(const json & data) {
    std::ostringstream ss;
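`format_oai_resp_sse` frames every event as an `event:` line followed by a `data:` line and a blank line. A hypothetical raw-HTTP consumer of that framing is sketched below; it is not part of the PR and only assumes `requests` (already present in the test requirements) and a local server on port 8080.

```python
# Sketch only: read the "event:/data:" framing produced by format_oai_resp_sse.
import json
import requests

event_type = ""
with requests.post(
    "http://localhost:8080/v1/responses",
    json={"model": "gpt-4.1", "input": "Hello", "stream": True},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line.startswith("event: "):
            event_type = line[len("event: "):]
        elif line.startswith("data: "):
            data = json.loads(line[len("data: "):])
            if event_type == "response.output_text.delta":
                print(data["delta"], end="", flush=True)
```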
@@ -294,6 +294,9 @@ json oaicompat_chat_params_parse(
    const server_chat_params & opt,
    std::vector<raw_buffer> & out_files);

+// convert OpenAI Responses API format to OpenAI Chat Completions API format
+json convert_responses_to_chatcmpl(const json & body);
+
// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);
@@ -331,6 +334,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_sse(const json & data);

+std::string format_oai_resp_sse(const json & data);
+
// format Anthropic-style SSE with event types
std::string format_anthropic_sse(const json & data);
@@ -3073,6 +3073,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
        json first_result_json = first_result->to_json();
        if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
            res->data = format_anthropic_sse(first_result_json);
+        } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+            res->data = format_oai_resp_sse(first_result_json);
        } else {
            res->data = format_oai_sse(first_result_json);
        }
@@ -3107,13 +3109,16 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(

        // check if there is more data
        if (!rd.has_next()) {
-            if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
-                // Anthropic doesn't send [DONE], message_stop was already sent
-                output = "";
-            } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
-                output = "data: [DONE]\n\n";
-            } else {
-                output = "";
-            }
+            switch (res_type) {
+                case TASK_RESPONSE_TYPE_NONE:
+                case TASK_RESPONSE_TYPE_OAI_RESP:
+                case TASK_RESPONSE_TYPE_ANTHROPIC:
+                    output = "";
+                    break;
+                default:
+                    output = "data: [DONE]\n\n";
+                    break;
+            }
            SRV_DBG("%s", "all results received, terminating stream\n");
            return false; // no more data, terminate
@@ -3141,6 +3146,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
        json res_json = result->to_json();
        if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
            output = format_anthropic_sse(res_json);
+        } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+            output = format_oai_resp_sse(res_json);
        } else {
            output = format_oai_sse(res_json);
        }
@@ -3575,6 +3582,22 @@ void server_routes::init_routes() {
            TASK_RESPONSE_TYPE_OAI_CHAT);
    };

+    this->post_responses_oai = [this](const server_http_req & req) {
+        auto res = create_response();
+        std::vector<raw_buffer> files;
+        json body = convert_responses_to_chatcmpl(json::parse(req.body));
+        json body_parsed = oaicompat_chat_params_parse(
+            body,
+            meta->chat_params,
+            files);
+        return handle_completions_impl(
+            req,
+            SERVER_TASK_TYPE_COMPLETION,
+            body_parsed,
+            files,
+            TASK_RESPONSE_TYPE_OAI_RESP);
+    };
+
    this->post_anthropic_messages = [this](const server_http_req & req) {
        auto res = create_response();
        std::vector<raw_buffer> files;
@@ -94,6 +94,7 @@ struct server_routes {
    server_http_context::handler_t post_completions;
    server_http_context::handler_t post_completions_oai;
    server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_responses_oai;
    server_http_context::handler_t post_anthropic_messages;
    server_http_context::handler_t post_anthropic_count_tokens;
    server_http_context::handler_t post_apply_template;
@@ -584,6 +584,8 @@ json server_task_result_cmpl_final::to_json() {
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+        case TASK_RESPONSE_TYPE_OAI_RESP:
+            return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return stream ? to_json_anthropic_stream() : to_json_anthropic();
        default:
@@ -801,6 +803,186 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
    return deltas;
}

+json server_task_result_cmpl_final::to_json_oaicompat_resp() {
+    common_chat_msg msg;
+    if (!oaicompat_msg.empty()) {
+        msg = oaicompat_msg;
+    } else {
+        msg.role    = "assistant";
+        msg.content = content;
+    }
+
+    std::vector<json> output;
+
+    if (msg.reasoning_content != "") {
+        output.push_back(json {
+            {"id", "rs_" + random_string()},
+            {"summary", json::array()},
+            {"type", "reasoning"},
+            {"content", json::array({ json {
+                {"text", msg.reasoning_content},
+                {"type", "reasoning_text"},
+            }})},
+            {"encrypted_content", ""},
+            {"status", "completed"},
+        });
+    }
+
+    if (msg.content != "") {
+        output.push_back(json {
+            {"content", json::array({ json {
+                {"type", "output_text"},
+                {"annotations", json::array()},
+                {"logprobs", json::array()},
+                {"text", msg.content},
+            }})},
+            {"id", "msg_" + random_string()},
+            {"role", msg.role},
+            {"status", "completed"},
+            {"type", "message"},
+        });
+    }
+
+    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
+        output.push_back(json {
+            {"type", "function_call"},
+            {"status", "completed"},
+            {"arguments", tool_call.arguments},
+            {"call_id", "fc_" + tool_call.id},
+            {"name", tool_call.name},
+        });
+    }
+
+    std::time_t t = std::time(0);
+    json res = {
+        {"completed_at", t},
+        {"created_at", t},
+        {"id", oai_resp_id},
+        {"model", oaicompat_model},
+        {"object", "response"},
+        {"output", output},
+        {"status", "completed"},
+        {"usage", json {
+            {"input_tokens", n_prompt_tokens},
+            {"output_tokens", n_decoded},
+            {"total_tokens", n_decoded + n_prompt_tokens},
+        }},
+    };
+
+    return res;
+}
+
+json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
+    std::vector<json> server_sent_events;
+    std::vector<json> output;
+
+    if (oaicompat_msg.reasoning_content != "") {
+        const json output_item = json {
+            {"id", oai_resp_reasoning_id},
+            {"summary", json::array()},
+            {"type", "reasoning"},
+            {"content", json::array({ json {
+                {"text", oaicompat_msg.reasoning_content},
+                {"type", "reasoning_text"},
+            }})},
+            {"encrypted_content", ""},
+        };
+
+        server_sent_events.push_back(json {
+            {"event", "response.output_item.done"},
+            {"data", json {
+                {"type", "response.output_item.done"},
+                {"item", output_item}
+            }}
+        });
+        output.push_back(output_item);
+    }
+
+    if (oaicompat_msg.content != "") {
+        server_sent_events.push_back(json {
+            {"event", "response.output_text.done"},
+            {"data", json {
+                {"type", "response.output_text.done"},
+                {"item_id", oai_resp_message_id},
+                {"text", oaicompat_msg.content}
+            }}
+        });
+
+        const json content_part = {
+            {"type", "output_text"},
+            {"annotations", json::array()},
+            {"logprobs", json::array()},
+            {"text", oaicompat_msg.content}
+        };
+
+        server_sent_events.push_back(json {
+            {"event", "response.content_part.done"},
+            {"data", json {
+                {"type", "response.content_part.done"},
+                {"item_id", oai_resp_message_id},
+                {"part", content_part}
+            }}
+        });
+        const json output_item = {
+            {"type", "message"},
+            {"status", "completed"},
+            {"id", oai_resp_message_id},
+            {"content", json::array({content_part})},
+            {"role", "assistant"}
+        };
+
+        server_sent_events.push_back(json {
+            {"event", "response.output_item.done"},
+            {"data", json {
+                {"type", "response.output_item.done"},
+                {"item", output_item}
+            }}
+        });
+        output.push_back(output_item);
+    }
+
+    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
+        const json output_item = {
+            {"type", "function_call"},
+            {"status", "completed"},
+            {"arguments", tool_call.arguments},
+            {"call_id", "fc_" + tool_call.id},
+            {"name", tool_call.name}
+        };
+        server_sent_events.push_back(json {
+            {"event", "response.output_item.done"},
+            {"data", json {
+                {"type", "response.output_item.done"},
+                {"item", output_item}
+            }}
+        });
+        output.push_back(output_item);
+    }
+
+    std::time_t t = std::time(0);
+    server_sent_events.push_back(json {
+        {"event", "response.completed"},
+        {"data", json {
+            {"type", "response.completed"},
+            {"response", json {
+                {"id", oai_resp_id},
+                {"object", "response"},
+                {"created_at", t},
+                {"status", "completed"},
+                {"model", oaicompat_model},
+                {"output", output},
+                {"usage", json {
+                    {"input_tokens", n_prompt_tokens},
+                    {"output_tokens", n_decoded},
+                    {"total_tokens", n_decoded + n_prompt_tokens}
+                }}
+            }},
+        }}
+    });
+
+    return server_sent_events;
+}
+
json server_task_result_cmpl_final::to_json_anthropic() {
    std::string stop_reason = "max_tokens";
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
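For reference, the non-streaming JSON produced by `to_json_oaicompat_resp()` above has roughly the shape sketched below; the field names follow the function, while the concrete values are made up for illustration.

```python
# Illustrative shape of a non-streaming /v1/responses result (values are made up).
example_response = {
    "id": "resp_abc123",
    "object": "response",
    "model": "gpt-4.1",
    "status": "completed",
    "created_at": 1700000000,
    "completed_at": 1700000000,
    "output": [
        {
            "type": "message",
            "id": "msg_def456",
            "role": "assistant",
            "status": "completed",
            "content": [
                {"type": "output_text", "text": "There once was a coder...",
                 "annotations": [], "logprobs": []},
            ],
        },
    ],
    "usage": {"input_tokens": 12, "output_tokens": 30, "total_tokens": 42},
}
```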
@@ -1057,6 +1239,36 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
//
// server_task_result_cmpl_partial
//

+void server_task_result_cmpl_partial::update(task_result_state & state) {
+    is_updated = true;
+    state.update_chat_msg(content, true, oaicompat_msg_diffs);
+
+    // Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
+    thinking_block_started = state.thinking_block_started;
+    text_block_started     = state.text_block_started;
+
+    oai_resp_id           = state.oai_resp_id;
+    oai_resp_reasoning_id = state.oai_resp_reasoning_id;
+    oai_resp_message_id   = state.oai_resp_message_id;
+    oai_resp_fc_id        = state.oai_resp_fc_id;
+
+    // track if the accumulated message has any reasoning content
+    anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
+
+    // Pre-compute state updates based on diffs (for next chunk)
+    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
+        if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) {
+            state.thinking_block_started = true;
+        }
+        if (!diff.content_delta.empty() && !state.text_block_started) {
+            state.text_block_started = true;
+        }
+        if (!diff.tool_call_delta.name.empty()) {
+            state.oai_resp_fc_id = diff.tool_call_delta.id;
+        }
+    }
+}
+
json server_task_result_cmpl_partial::to_json() {
    GGML_ASSERT(is_updated && "update() must be called before to_json()");
    switch (res_type) {
@@ -1066,6 +1278,8 @@ json server_task_result_cmpl_partial::to_json() {
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return to_json_oaicompat_chat();
+        case TASK_RESPONSE_TYPE_OAI_RESP:
+            return to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return to_json_anthropic();
        default:
@@ -1190,6 +1404,132 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
    return deltas;
}

+json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
+    std::vector<json> events;
+
+    if (n_decoded == 1) {
+        events.push_back(json {
+            {"event", "response.created"},
+            {"data", json {
+                {"type", "response.created"},
+                {"response", json {
+                    {"id", oai_resp_id},
+                    {"object", "response"},
+                    {"status", "in_progress"},
+                }},
+            }},
+        });
+        events.push_back(json {
+            {"event", "response.in_progress"},
+            {"data", json {
+                {"type", "response.in_progress"},
+                {"response", json {
+                    {"id", oai_resp_id},
+                    {"object", "response"},
+                    {"status", "in_progress"},
+                }},
+            }},
+        });
+    }
+
+    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
+        if (!diff.reasoning_content_delta.empty()) {
+            if (!thinking_block_started) {
+                events.push_back(json {
+                    {"event", "response.output_item.added"},
+                    {"data", json {
+                        {"type", "response.output_item.added"},
+                        {"item", json {
+                            {"id", oai_resp_reasoning_id},
+                            {"summary", json::array()},
+                            {"type", "reasoning"},
+                            {"content", json::array()},
+                            {"encrypted_content", ""},
+                            {"status", "in_progress"},
+                        }},
+                    }},
+                });
+                thinking_block_started = true;
+            }
+            events.push_back(json {
+                {"event", "response.reasoning_text.delta"},
+                {"data", json {
+                    {"type", "response.reasoning_text.delta"},
+                    {"delta", diff.reasoning_content_delta},
+                    {"item_id", oai_resp_reasoning_id},
+                }},
+            });
+        }
+
+        if (!diff.content_delta.empty()) {
+            if (!text_block_started) {
+                events.push_back(json {
+                    {"event", "response.output_item.added"},
+                    {"data", json {
+                        {"type", "response.output_item.added"},
+                        {"item", json {
+                            {"content", json::array()},
+                            {"id", oai_resp_message_id},
+                            {"role", "assistant"},
+                            {"status", "in_progress"},
+                            {"type", "message"},
+                        }},
+                    }},
+                });
+                events.push_back(json {
+                    {"event", "response.content_part.added"},
+                    {"data", json {
+                        {"type", "response.content_part.added"},
+                        {"item_id", oai_resp_message_id},
+                        {"part", json {
+                            {"type", "output_text"},
+                            {"text", ""},
+                        }},
+                    }},
+                });
+                text_block_started = true;
+            }
+            events.push_back(json {
+                {"event", "response.output_text.delta"},
+                {"data", json {
+                    {"type", "response.output_text.delta"},
+                    {"item_id", oai_resp_message_id},
+                    {"delta", diff.content_delta},
+                }},
+            });
+        }
+
+        if (!diff.tool_call_delta.name.empty()) {
+            events.push_back(json {
+                {"event", "response.output_item.added"},
+                {"data", json {
+                    {"type", "response.output_item.added"},
+                    {"item", json {
+                        {"arguments", ""},
+                        {"call_id", "fc_" + diff.tool_call_delta.id},
+                        {"name", diff.tool_call_delta.name},
+                        {"type", "function_call"},
+                        {"status", "in_progress"},
+                    }},
+                }},
+            });
+            oai_resp_fc_id = diff.tool_call_delta.id;
+        }
+
+        if (!diff.tool_call_delta.arguments.empty()) {
+            events.push_back(json {
+                {"event", "response.function_call_arguments.delta"},
+                {"data", json {
+                    {"type", "response.function_call_arguments.delta"},
+                    {"delta", diff.tool_call_delta.arguments},
+                    {"item_id", "fc_" + oai_resp_fc_id},
+                }},
+            });
+        }
+    }
+    return events;
+}
+
//
// server_task_result_embd
//
@@ -1260,8 +1600,8 @@ json server_task_result_cmpl_partial::to_json_anthropic() {

    // use local copies of streaming state (copied from task_result_state in update())
    // these reflect the state BEFORE this chunk was processed
-    bool thinking_started = anthropic_thinking_block_started;
-    bool text_started     = anthropic_text_block_started;
+    bool thinking_started = thinking_block_started;
+    bool text_started     = text_block_started;

    for (const auto & diff : oaicompat_msg_diffs) {
        // handle thinking/reasoning content
@@ -33,6 +33,7 @@ enum task_response_type {
    TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
    TASK_RESPONSE_TYPE_OAI_CHAT,
    TASK_RESPONSE_TYPE_OAI_CMPL,
+    TASK_RESPONSE_TYPE_OAI_RESP,
    TASK_RESPONSE_TYPE_OAI_EMBD,
    TASK_RESPONSE_TYPE_ANTHROPIC,
};
@@ -98,12 +99,22 @@ struct task_result_state {
    std::string generated_text; // append new chunks of generated text here
    std::vector<std::string> generated_tool_call_ids;

-    // for Anthropic API streaming: track content block state across chunks
-    bool anthropic_thinking_block_started = false;
-    bool anthropic_text_block_started = false;
+    // for OpenAI Responses and Anthropic streaming API:
+    // track output item / content block state across chunks
+    bool thinking_block_started = false;
+    bool text_block_started = false;
+
+    // for OpenAI Responses streaming API
+    const std::string oai_resp_id;
+    const std::string oai_resp_reasoning_id;
+    const std::string oai_resp_message_id;
+    std::string oai_resp_fc_id; // function call ID for current args delta

    task_result_state(const common_chat_parser_params & chat_parser_params)
-        : chat_parser_params(chat_parser_params) {}
+        : chat_parser_params(chat_parser_params)
+        , oai_resp_id("resp_" + random_string())
+        , oai_resp_reasoning_id("rs_" + random_string())
+        , oai_resp_message_id("msg_" + random_string()) {}

    // parse partial tool calls and update the internal state
    common_chat_msg update_chat_msg(
@@ -352,6 +363,11 @@ struct server_task_result_cmpl_final : server_task_result {
    std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
    bool is_updated = false;

+    // for OpenAI Responses API
+    std::string oai_resp_id;
+    std::string oai_resp_reasoning_id;
+    std::string oai_resp_message_id;
+
    virtual bool is_stop() override {
        return true; // in stream mode, final responses are considered stop
    }
@@ -361,6 +377,10 @@ struct server_task_result_cmpl_final : server_task_result {
    virtual void update(task_result_state & state) override {
        is_updated = true;
        oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs);
+
+        oai_resp_id           = state.oai_resp_id;
+        oai_resp_reasoning_id = state.oai_resp_reasoning_id;
+        oai_resp_message_id   = state.oai_resp_message_id;
    }

    json to_json_non_oaicompat();
|
||||||
|
|
||||||
json to_json_oaicompat_chat_stream();
|
json to_json_oaicompat_chat_stream();
|
||||||
|
|
||||||
|
json to_json_oaicompat_resp();
|
||||||
|
|
||||||
|
json to_json_oaicompat_resp_stream();
|
||||||
|
|
||||||
json to_json_anthropic();
|
json to_json_anthropic();
|
||||||
|
|
||||||
json to_json_anthropic_stream();
|
json to_json_anthropic_stream();
|
||||||
|
|
@@ -397,45 +421,35 @@ struct server_task_result_cmpl_partial : server_task_result {
    std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
    bool is_updated = false;

+    // Streaming state copied from task_result_state for this chunk
+    bool thinking_block_started = false;
+    bool text_block_started = false;
+
+    // for OpenAI Responses API
+    std::string oai_resp_id;
+    std::string oai_resp_reasoning_id;
+    std::string oai_resp_message_id;
+    std::string oai_resp_fc_id;
+
    // for Anthropic API: track if any reasoning content has been generated
    bool anthropic_has_reasoning = false;
-    // Streaming state copied from task_result_state for this chunk
-    bool anthropic_thinking_block_started = false;
-    bool anthropic_text_block_started = false;

    virtual bool is_stop() override {
        return false; // in stream mode, partial responses are not considered stop
    }

+    virtual void update(task_result_state & state) override;
+
    virtual json to_json() override;

-    virtual void update(task_result_state & state) override {
-        is_updated = true;
-        state.update_chat_msg(content, true, oaicompat_msg_diffs);
-        // track if the accumulated message has any reasoning content
-        anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
-
-        // Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
-        anthropic_thinking_block_started = state.anthropic_thinking_block_started;
-        anthropic_text_block_started = state.anthropic_text_block_started;
-
-        // Pre-compute state updates based on diffs (for next chunk)
-        for (const auto & diff : oaicompat_msg_diffs) {
-            if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
-                state.anthropic_thinking_block_started = true;
-            }
-            if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
-                state.anthropic_text_block_started = true;
-            }
-        }
-    }
-
    json to_json_non_oaicompat();

    json to_json_oaicompat();

    json to_json_oaicompat_chat();

+    json to_json_oaicompat_resp();
+
    json to_json_anthropic();
};
@@ -140,6 +140,7 @@ int main(int argc, char ** argv) {
    routes.post_completions = models_routes->proxy_post;
    routes.post_completions_oai = models_routes->proxy_post;
    routes.post_chat_completions = models_routes->proxy_post;
+    routes.post_responses_oai = models_routes->proxy_post;
    routes.post_anthropic_messages = models_routes->proxy_post;
    routes.post_anthropic_count_tokens = models_routes->proxy_post;
    routes.post_infill = models_routes->proxy_post;
@@ -176,6 +177,7 @@ int main(int argc, char ** argv) {
    ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
    ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
    ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
+    ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
    ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
    ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
    ctx_http.post("/infill", ex_wrapper(routes.post_infill));
@@ -2,7 +2,7 @@ aiohttp~=3.9.3
pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
prometheus-client~=0.20.0
requests~=2.32.3
wget~=3.2
@@ -0,0 +1,73 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server: ServerProcess
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_responses_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.responses.create(
+        model="gpt-4.1",
+        input=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_output_tokens=8,
+        temperature=0.8,
+    )
+    assert res.id.startswith("resp_")
+    assert res.output[0].id is not None
+    assert res.output[0].id.startswith("msg_")
+    assert match_regex("(Suddenly)+", res.output_text)
+
+
+def test_responses_stream_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    stream = client.responses.create(
+        model="gpt-4.1",
+        input=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_output_tokens=8,
+        temperature=0.8,
+        stream=True,
+    )
+
+    gathered_text = ''
+    resp_id = ''
+    msg_id = ''
+    for r in stream:
+        if r.type == "response.created":
+            assert r.response.id.startswith("resp_")
+            resp_id = r.response.id
+        if r.type == "response.in_progress":
+            assert r.response.id == resp_id
+        if r.type == "response.output_item.added":
+            assert r.item.id is not None
+            assert r.item.id.startswith("msg_")
+            msg_id = r.item.id
+        if (r.type == "response.content_part.added" or
+            r.type == "response.output_text.delta" or
+            r.type == "response.output_text.done" or
+            r.type == "response.content_part.done"):
+            assert r.item_id == msg_id
+        if r.type == "response.output_item.done":
+            assert r.item.id == msg_id
+
+        if r.type == "response.output_text.delta":
+            gathered_text += r.delta
+        if r.type == "response.completed":
+            assert r.response.id.startswith("resp_")
+            assert r.response.output[0].id is not None
+            assert r.response.output[0].id.startswith("msg_")
+            assert gathered_text == r.response.output_text
+            assert match_regex("(Suddenly)+", r.response.output_text)