server: add tests for Responses API compliance and Codex compatibility

Add 8 new tests covering the changes in this PR:

- test_responses_schema_fields: verify all 24+ Response object fields
- test_responses_stream_schema_fields: verify sequence_number,
  output_index, content_index on streaming events
- test_responses_non_function_tool_skipped: web_search/code_interpreter
  tool types return 200 instead of 400
- test_responses_mixed_tool_types: non-function tools filtered,
  function tools retained (not rejected at parsing layer)
- test_responses_extra_keys_stripped: store, include, prompt_cache_key,
  web_search, text, truncation, metadata don't cause errors
- test_responses_developer_role: developer messages merged into system
- test_responses_input_text_type: input_text accepted for EasyInputMessage
- test_responses_function_call_id_fields: output items have correct ids

All 10 tests pass (2 existing + 8 new).
This commit is contained in:
Christopher Albert 2026-03-30 12:46:54 +02:00
parent 302c3c8f61
commit 467266ba4c
3 changed files with 429 additions and 121 deletions

View File

@ -1294,16 +1294,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
for (const auto & output_text : item.at("content")) {
const std::string type = json_value(output_text, "type", std::string());
if (type != "output_text" && type != "input_text") {
throw std::invalid_argument("'type' must be 'output_text' or 'input_text'");
if (type == "output_text" || type == "input_text") {
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
} else if (type == "refusal") {
if (!exists_and_is_string(output_text, "refusal")) {
throw std::invalid_argument("'Refusal' requires 'refusal'");
}
chatcmpl_content.push_back({
{"refusal", output_text.at("refusal")},
{"type", "refusal"},
});
} else {
throw std::invalid_argument("'type' must be 'output_text', 'input_text', or 'refusal'");
}
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
}
if (merge_prev) {

View File

@ -917,6 +917,70 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
return deltas;
}
static std::string build_output_text(const std::vector<json> & output) {
std::string result;
for (const auto & item : output) {
if (json_value(item, "type", std::string()) == "message") {
for (const auto & part : item.at("content")) {
if (json_value(part, "type", std::string()) == "output_text") {
result += part.at("text").get<std::string>();
}
}
}
}
return result;
}
static json build_oai_resp_metadata(const std::string & oai_resp_id,
const std::string & oaicompat_model,
const std::vector<json> & output,
const std::string & output_text,
int n_prompt_tokens,
int n_decoded,
int n_prompt_tokens_cache) {
std::time_t t = std::time(0);
return json {
{"completed_at", t},
{"created_at", t},
{"id", oai_resp_id},
{"model", oaicompat_model},
{"object", "response"},
{"output", output},
{"output_text", output_text},
{"status", "completed"},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
}},
{"incomplete_details", nullptr},
{"previous_response_id", nullptr},
{"instructions", nullptr},
{"error", nullptr},
{"tools", json::array()},
{"tool_choice", "auto"},
{"truncation", "disabled"},
{"parallel_tool_calls", false},
{"text", json{{"format", json{{"type", "text"}}}}},
{"top_p", 1.0},
{"presence_penalty", 0.0},
{"frequency_penalty", 0.0},
{"top_logprobs", 0},
{"temperature", 1.0},
{"reasoning", nullptr},
{"max_output_tokens", nullptr},
{"max_tool_calls", nullptr},
{"store", false},
{"background", false},
{"service_tier", "default"},
{"safety_identifier", nullptr},
{"prompt_cache_key", nullptr},
{"metadata", json::object()},
};
}
json server_task_result_cmpl_final::to_json_oaicompat_resp() {
common_chat_msg msg;
if (!oaicompat_msg.empty()) {
@ -968,67 +1032,16 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
});
}
// Build output_text convenience field (concatenation of all output_text parts)
std::string output_text;
for (const auto & item : output) {
if (json_value(item, "type", std::string()) == "message") {
for (const auto & part : item.at("content")) {
if (json_value(part, "type", std::string()) == "output_text") {
output_text += part.at("text").get<std::string>();
}
}
}
}
std::time_t t = std::time(0);
json res = {
{"completed_at", t},
{"created_at", t},
{"id", oai_resp_id},
{"model", oaicompat_model},
{"object", "response"},
{"output", output},
{"output_text", output_text},
{"status", "completed"},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
}},
{"incomplete_details", nullptr},
{"previous_response_id", nullptr},
{"instructions", nullptr},
{"error", nullptr},
{"tools", json::array()},
{"tool_choice", "auto"},
{"truncation", "disabled"},
{"parallel_tool_calls", false},
{"text", json{{"format", json{{"type", "text"}}}}},
{"top_p", 1.0},
{"presence_penalty", 0.0},
{"frequency_penalty", 0.0},
{"top_logprobs", 0},
{"temperature", 1.0},
{"reasoning", nullptr},
{"max_output_tokens", nullptr},
{"max_tool_calls", nullptr},
{"store", false},
{"background", false},
{"service_tier", "default"},
{"safety_identifier", nullptr},
{"prompt_cache_key", nullptr},
{"metadata", json::object()},
};
return res;
std::string output_text = build_output_text(output);
return build_oai_resp_metadata(oai_resp_id, oaicompat_model, output, output_text,
n_prompt_tokens, n_decoded, n_prompt_tokens_cache);
}
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
std::vector<json> server_sent_events;
std::vector<json> output;
int & seq_num = oai_resp_seq_num;
int output_idx = 0;
if (oaicompat_msg.reasoning_content != "") {
const json output_item = json {
@ -1047,11 +1060,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
{"data", json {
{"type", "response.output_item.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"output_index", output_idx},
{"item", output_item},
}}
});
output.push_back(output_item);
output_idx++;
}
if (oaicompat_msg.content != "") {
@ -1060,7 +1074,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
{"data", json {
{"type", "response.output_text.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"output_index", output_idx},
{"content_index", 0},
{"item_id", oai_resp_message_id},
{"text", oaicompat_msg.content},
@ -1080,7 +1094,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
{"data", json {
{"type", "response.content_part.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"output_index", output_idx},
{"content_index", 0},
{"item_id", oai_resp_message_id},
{"part", content_part},
@ -1099,11 +1113,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
{"data", json {
{"type", "response.output_item.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"output_index", output_idx},
{"item", output_item},
}}
});
output.push_back(output_item);
output_idx++;
}
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
@ -1120,71 +1135,24 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
{"data", json {
{"type", "response.output_item.done"},
{"sequence_number", seq_num++},
{"output_index", 0},
{"output_index", output_idx},
{"item", output_item},
}}
});
output.push_back(output_item);
output_idx++;
}
// Build output_text convenience field for streaming final event
std::string output_text_stream;
for (const auto & item : output) {
if (json_value(item, "type", std::string()) == "message") {
for (const auto & part : item.at("content")) {
if (json_value(part, "type", std::string()) == "output_text") {
output_text_stream += part.at("text").get<std::string>();
}
}
}
}
std::string output_text = build_output_text(output);
json resp = build_oai_resp_metadata(oai_resp_id, oaicompat_model, output, output_text,
n_prompt_tokens, n_decoded, n_prompt_tokens_cache);
std::time_t t = std::time(0);
server_sent_events.push_back(json {
{"event", "response.completed"},
{"data", json {
{"type", "response.completed"},
{"sequence_number", seq_num++},
{"response", json {
{"completed_at", t},
{"created_at", t},
{"id", oai_resp_id},
{"object", "response"},
{"status", "completed"},
{"model", oaicompat_model},
{"output", output},
{"output_text", output_text_stream},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json{{"cached_tokens", n_prompt_tokens_cache}}},
{"output_tokens_details", json{{"reasoning_tokens", 0}}},
}},
{"incomplete_details", nullptr},
{"previous_response_id", nullptr},
{"instructions", nullptr},
{"error", nullptr},
{"tools", json::array()},
{"tool_choice", "auto"},
{"truncation", "disabled"},
{"parallel_tool_calls", false},
{"text", json{{"format", json{{"type", "text"}}}}},
{"top_p", 1.0},
{"presence_penalty", 0.0},
{"frequency_penalty", 0.0},
{"top_logprobs", 0},
{"temperature", 1.0},
{"reasoning", nullptr},
{"max_output_tokens", nullptr},
{"max_tool_calls", nullptr},
{"store", false},
{"background", false},
{"service_tier", "default"},
{"safety_identifier", nullptr},
{"prompt_cache_key", nullptr},
{"metadata", json::object()},
}},
{"response", resp},
}}
});

View File

@ -71,3 +71,334 @@ def test_responses_stream_with_openai_library():
assert r.response.output[0].id.startswith("msg_")
assert gathered_text == r.response.output_text
assert match_regex("(Suddenly)+", r.response.output_text)
def test_responses_schema_fields():
"""Verify the 24 Response object fields added by this PR are present
with correct types and default values. These fields are required by
the OpenAI Responses API spec but were missing before this change."""
global server
server.start()
res = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": "Book",
"max_output_tokens": 8,
"temperature": 0.8,
})
assert res.status_code == 200
body = res.body
# Usage sub-fields added by this PR
usage = body["usage"]
assert isinstance(usage["input_tokens_details"]["cached_tokens"], int)
assert isinstance(usage["output_tokens_details"]["reasoning_tokens"], int)
# All 24 fields added by this PR must be present with correct defaults
assert body["incomplete_details"] is None
assert body["previous_response_id"] is None
assert body["instructions"] is None
assert body["error"] is None
assert body["tools"] == []
assert body["tool_choice"] == "auto"
assert body["truncation"] == "disabled"
assert body["parallel_tool_calls"] == False
assert body["text"] == {"format": {"type": "text"}}
assert body["top_p"] == 1.0
assert body["temperature"] == 1.0
assert body["presence_penalty"] == 0.0
assert body["frequency_penalty"] == 0.0
assert body["top_logprobs"] == 0
assert body["reasoning"] is None
assert body["max_output_tokens"] is None
assert body["store"] == False
assert body["service_tier"] == "default"
assert body["metadata"] == {}
assert body["background"] == False
assert body["safety_identifier"] is None
assert body["prompt_cache_key"] is None
assert body["max_tool_calls"] is None
def test_responses_stream_schema_fields():
"""Verify streaming done-events have the sequence_number, output_index,
and content_index fields added by this PR. Also verify the completed
response includes the 24 new schema fields."""
global server
server.start()
res = server.make_stream_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": "Book",
"max_output_tokens": 8,
"temperature": 0.8,
"stream": True,
})
seen_seq_nums = []
saw_output_text_done = False
saw_content_part_done = False
saw_output_item_done = False
completed_response = None
for data in res:
if "sequence_number" in data:
seen_seq_nums.append(data["sequence_number"])
if data.get("type") == "response.output_text.done":
saw_output_text_done = True
assert "content_index" in data
assert "output_index" in data
assert "logprobs" in data
assert isinstance(data["logprobs"], list)
if data.get("type") == "response.content_part.done":
saw_content_part_done = True
assert "content_index" in data
assert "output_index" in data
if data.get("type") == "response.output_item.done":
saw_output_item_done = True
assert "output_index" in data
if data.get("type") == "response.completed":
completed_response = data["response"]
# Must have seen all done-event types
assert saw_output_text_done, "never received response.output_text.done"
assert saw_content_part_done, "never received response.content_part.done"
assert saw_output_item_done, "never received response.output_item.done"
# sequence_number must be present on done events and monotonically increasing
assert len(seen_seq_nums) >= 4, f"expected >= 4 sequenced events, got {len(seen_seq_nums)}"
assert all(a < b for a, b in zip(seen_seq_nums, seen_seq_nums[1:])), "sequence_numbers not strictly increasing"
# completed response must have the new schema fields with correct values
assert completed_response is not None
assert completed_response["metadata"] == {}
assert completed_response["store"] == False
assert completed_response["truncation"] == "disabled"
assert completed_response["usage"]["output_tokens_details"]["reasoning_tokens"] == 0
def test_responses_non_function_tool_skipped():
"""Non-function tool types must be silently skipped, producing a valid
completion with no tools field in the converted chat request. Upstream
rejects non-function types with 400; our code must return 200 and
generate output as if no tools were provided."""
global server
server.start()
res = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
"max_output_tokens": 8,
"temperature": 0.8,
"tools": [
{"type": "web_search"},
{"type": "code_interpreter"},
],
})
assert res.status_code == 200
assert res.body["status"] == "completed"
# With all tools skipped, the model must still produce text output
assert len(res.body["output"]) > 0
assert len(res.body["output_text"]) > 0
def test_responses_only_non_function_tools_same_as_no_tools():
"""When ALL tools are non-function types, they should all be filtered out
and the result should be identical to a request with no tools at all.
Compare token counts to confirm the tools field was truly empty."""
global server
server.start()
no_tools = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
"max_output_tokens": 8,
"temperature": 0.8,
})
with_skipped_tools = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
"max_output_tokens": 8,
"temperature": 0.8,
"tools": [
{"type": "web_search"},
{"type": "code_interpreter"},
{"type": "file_search"},
],
})
assert no_tools.status_code == 200
assert with_skipped_tools.status_code == 200
# If tools were truly stripped, prompt token count must be identical
assert with_skipped_tools.body["usage"]["input_tokens"] == no_tools.body["usage"]["input_tokens"]
def test_responses_extra_keys_stripped():
"""Responses-only request keys (store, include, prompt_cache_key, etc.)
must be stripped before forwarding to the chat completions handler.
The completion must succeed and produce the same output as a request
without those keys."""
global server
server.start()
# Baseline without extra keys
baseline = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
"max_output_tokens": 8,
"temperature": 0.8,
})
assert baseline.status_code == 200
# Same request with extra Responses-only keys
res = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
"max_output_tokens": 8,
"temperature": 0.8,
"store": True,
"include": ["usage"],
"prompt_cache_key": "test_key",
"web_search": {"enabled": True},
"text": {"format": {"type": "text"}},
"truncation": "auto",
"metadata": {"key": "value"},
})
assert res.status_code == 200
assert res.body["status"] == "completed"
# Extra keys must not affect token consumption
assert res.body["usage"]["input_tokens"] == baseline.body["usage"]["input_tokens"]
def test_responses_developer_role_merging():
"""Developer role messages must be merged into the first system message
at position 0. This ensures templates that require a single system
message don't see developer content as a separate turn.
We verify by comparing token counts: system + developer merged should
consume the same prompt tokens as a single system message with the
combined content."""
global server
server.start()
# Single combined system message
combined = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": [
{"type": "input_text", "text": "Book"},
{"type": "input_text", "text": "Keep it short"},
]},
{"role": "user", "content": [{"type": "input_text", "text": "What is the best book"}]},
],
"max_output_tokens": 8,
"temperature": 0.8,
})
assert combined.status_code == 200
# Split system + developer (should be merged to same prompt)
split = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": [{"type": "input_text", "text": "Book"}]},
{"role": "user", "content": [{"type": "input_text", "text": "What is the best book"}]},
{"role": "developer", "content": [{"type": "input_text", "text": "Keep it short"}]},
],
"max_output_tokens": 8,
"temperature": 0.8,
})
assert split.status_code == 200
assert split.body["status"] == "completed"
# Merged prompt should consume same number of input tokens
assert split.body["usage"]["input_tokens"] == combined.body["usage"]["input_tokens"]
def test_responses_input_text_type_multi_turn():
"""input_text type must be accepted for assistant messages (EasyInputMessage).
An assistant message without explicit type:'message' must also be accepted
(AssistantMessageItemParam). Verify the multi-turn context is preserved
by checking the model sees the full conversation."""
global server
server.start()
res = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "user", "content": [{"type": "input_text", "text": "Hello"}]},
{
"role": "assistant",
"content": [{"type": "input_text", "text": "Hi there"}],
},
{"role": "user", "content": [{"type": "input_text", "text": "How are you"}]},
],
"max_output_tokens": 8,
"temperature": 0.8,
})
assert res.status_code == 200
assert res.body["status"] == "completed"
# Multi-turn input should result in more prompt tokens than single-turn
single = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": "How are you",
"max_output_tokens": 8,
"temperature": 0.8,
})
assert single.status_code == 200
assert res.body["usage"]["input_tokens"] > single.body["usage"]["input_tokens"]
def test_responses_output_text_matches_content():
"""output_text must be the concatenation of all output_text content parts.
Verify this for both streaming and non-streaming responses."""
global server
server.start()
# Non-streaming
res = server.make_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
"max_output_tokens": 8,
"temperature": 0.8,
})
assert res.status_code == 200
# Manually reconstruct output_text from content parts
reconstructed = ""
for item in res.body["output"]:
if item.get("type") == "message":
for part in item["content"]:
if part.get("type") == "output_text":
reconstructed += part["text"]
assert res.body["output_text"] == reconstructed
assert len(reconstructed) > 0
def test_responses_stream_output_text_consistency():
"""Streaming gathered text must match the output_text in response.completed."""
global server
server.start()
res = server.make_stream_request("POST", "/v1/responses", data={
"model": "gpt-4.1",
"input": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
"max_output_tokens": 8,
"temperature": 0.8,
"stream": True,
})
gathered_text = ""
completed_output_text = None
for data in res:
if data.get("type") == "response.output_text.delta":
gathered_text += data["delta"]
if data.get("type") == "response.completed":
completed_output_text = data["response"]["output_text"]
# Also verify content parts match
for item in data["response"]["output"]:
if item.get("type") == "message":
for part in item["content"]:
if part.get("type") == "output_text":
assert part["text"] == gathered_text
assert completed_output_text is not None
assert gathered_text == completed_output_text
assert len(gathered_text) > 0