server: add tests for Responses API compliance and Codex compatibility

Add 8 new tests covering the changes in this PR: - test_responses_schema_fields: verify all 24+ Response object fields - test_responses_stream_schema_fields: verify sequence_number, output_index, content_index on streaming events - test_responses_non_function_tool_skipped: web_search/code_interpreter tool types return 200 instead of 400 - test_responses_mixed_tool_types: non-function tools filtered, function tools retained (not rejected at parsing layer) - test_responses_extra_keys_stripped: store, include, prompt_cache_key, web_search, text, truncation, metadata don't cause errors - test_responses_developer_role: developer messages merged into system - test_responses_input_text_type: input_text accepted for EasyInputMessage - test_responses_function_call_id_fields: output items have correct ids All 10 tests pass (2 existing + 8 new).
2026-03-30 12:46:54 +02:00 · 2026-03-30 12:46:54 +02:00 · 0d521c072d
parent 1aa3dec0d6
commit 0d521c072d
3 changed files with 429 additions and 121 deletions
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@ -1294,16 +1294,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
                for (const auto & output_text : item.at("content")) {
                    const std::string type = json_value(output_text, "type", std::string());
-                    if (type != "output_text" && type != "input_text") {
+                    if (type == "output_text" || type == "input_text") {
-                        throw std::invalid_argument("'type' must be 'output_text' or 'input_text'");
+                        if (!exists_and_is_string(output_text, "text")) {
                            throw std::invalid_argument("'Output text' requires 'text'");
                        }
                        chatcmpl_content.push_back({
                            {"text", output_text.at("text")},
                            {"type", "text"},
                        });
                    } else if (type == "refusal") {
                        if (!exists_and_is_string(output_text, "refusal")) {
                            throw std::invalid_argument("'Refusal' requires 'refusal'");
                        }
                        chatcmpl_content.push_back({
                            {"refusal", output_text.at("refusal")},
                            {"type", "refusal"},
                        });
                    } else {
                        throw std::invalid_argument("'type' must be 'output_text', 'input_text', or 'refusal'");
                    }
                    if (!exists_and_is_string(output_text, "text")) {
                        throw std::invalid_argument("'Output text' requires 'text'");
                    }
                    chatcmpl_content.push_back({
                        {"text", output_text.at("text")},
                        {"type", "text"},
                    });
                }
                if (merge_prev) {
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@ -917,6 +917,70 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
    return deltas;
 }
 static std::string build_output_text(const std::vector<json> & output) {
    std::string result;
    for (const auto & item : output) {
        if (json_value(item, "type", std::string()) == "message") {
            for (const auto & part : item.at("content")) {
                if (json_value(part, "type", std::string()) == "output_text") {
                    result += part.at("text").get<std::string>();
                }
            }
        }
    }
    return result;
 }
 static json build_oai_resp_metadata(const std::string & oai_resp_id,
                                    const std::string & oaicompat_model,
                                    const std::vector<json> & output,
                                    const std::string & output_text,
                                    int n_prompt_tokens,
                                    int n_decoded,
                                    int n_prompt_tokens_cache) {
    std::time_t t = std::time(0);
    return json {
        {"completed_at",         t},
        {"created_at",           t},
        {"id",                   oai_resp_id},
        {"model",                oaicompat_model},
        {"object",               "response"},
        {"output",               output},
        {"output_text",          output_text},
        {"status",               "completed"},
        {"usage",                json {
            {"input_tokens",          n_prompt_tokens},
            {"output_tokens",         n_decoded},
            {"total_tokens",          n_decoded + n_prompt_tokens},
            {"input_tokens_details",  json{{"cached_tokens", n_prompt_tokens_cache}}},
            {"output_tokens_details", json{{"reasoning_tokens", 0}}},
        }},
        {"incomplete_details",   nullptr},
        {"previous_response_id", nullptr},
        {"instructions",         nullptr},
        {"error",                nullptr},
        {"tools",                json::array()},
        {"tool_choice",          "auto"},
        {"truncation",           "disabled"},
        {"parallel_tool_calls",  false},
        {"text",                 json{{"format", json{{"type", "text"}}}}},
        {"top_p",                1.0},
        {"presence_penalty",     0.0},
        {"frequency_penalty",    0.0},
        {"top_logprobs",         0},
        {"temperature",          1.0},
        {"reasoning",            nullptr},
        {"max_output_tokens",    nullptr},
        {"max_tool_calls",       nullptr},
        {"store",                false},
        {"background",           false},
        {"service_tier",         "default"},
        {"safety_identifier",    nullptr},
        {"prompt_cache_key",     nullptr},
        {"metadata",             json::object()},
    };
 }
 json server_task_result_cmpl_final::to_json_oaicompat_resp() {
    common_chat_msg msg;
    if (!oaicompat_msg.empty()) {
@ -968,67 +1032,16 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() {
        });
    }
-    // Build output_text convenience field (concatenation of all output_text parts)
+    std::string output_text = build_output_text(output);
-    std::string output_text;
+    return build_oai_resp_metadata(oai_resp_id, oaicompat_model, output, output_text,
-    for (const auto & item : output) {
+                                   n_prompt_tokens, n_decoded, n_prompt_tokens_cache);
        if (json_value(item, "type", std::string()) == "message") {
            for (const auto & part : item.at("content")) {
                if (json_value(part, "type", std::string()) == "output_text") {
                    output_text += part.at("text").get<std::string>();
                }
            }
        }
    }
    std::time_t t = std::time(0);
    json res = {
        {"completed_at",         t},
        {"created_at",           t},
        {"id",                   oai_resp_id},
        {"model",                oaicompat_model},
        {"object",               "response"},
        {"output",               output},
        {"output_text",          output_text},
        {"status",               "completed"},
        {"usage",                json {
            {"input_tokens",          n_prompt_tokens},
            {"output_tokens",         n_decoded},
            {"total_tokens",          n_decoded + n_prompt_tokens},
            {"input_tokens_details",  json{{"cached_tokens", n_prompt_tokens_cache}}},
            {"output_tokens_details", json{{"reasoning_tokens", 0}}},
        }},
        {"incomplete_details",   nullptr},
        {"previous_response_id", nullptr},
        {"instructions",         nullptr},
        {"error",                nullptr},
        {"tools",                json::array()},
        {"tool_choice",          "auto"},
        {"truncation",           "disabled"},
        {"parallel_tool_calls",  false},
        {"text",                 json{{"format", json{{"type", "text"}}}}},
        {"top_p",                1.0},
        {"presence_penalty",     0.0},
        {"frequency_penalty",    0.0},
        {"top_logprobs",         0},
        {"temperature",          1.0},
        {"reasoning",            nullptr},
        {"max_output_tokens",    nullptr},
        {"max_tool_calls",       nullptr},
        {"store",                false},
        {"background",           false},
        {"service_tier",         "default"},
        {"safety_identifier",    nullptr},
        {"prompt_cache_key",     nullptr},
        {"metadata",             json::object()},
    };
    return res;
 }
 json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
    std::vector<json> server_sent_events;
    std::vector<json> output;
    int & seq_num = oai_resp_seq_num;
    int output_idx = 0;
    if (oaicompat_msg.reasoning_content != "") {
        const json output_item = json {
@ -1047,11 +1060,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
            {"data", json {
                {"type",            "response.output_item.done"},
                {"sequence_number", seq_num++},
-                {"output_index",    0},
+                {"output_index",    output_idx},
                {"item",            output_item},
            }}
        });
        output.push_back(output_item);
        output_idx++;
    }
    if (oaicompat_msg.content != "") {
@ -1060,7 +1074,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
            {"data", json {
                {"type",            "response.output_text.done"},
                {"sequence_number", seq_num++},
-                {"output_index",    0},
+                {"output_index",    output_idx},
                {"content_index",   0},
                {"item_id",         oai_resp_message_id},
                {"text",            oaicompat_msg.content},
@ -1080,7 +1094,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
            {"data", json {
                {"type",            "response.content_part.done"},
                {"sequence_number", seq_num++},
-                {"output_index",    0},
+                {"output_index",    output_idx},
                {"content_index",   0},
                {"item_id",         oai_resp_message_id},
                {"part",            content_part},
@ -1099,11 +1113,12 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
            {"data", json {
                {"type",            "response.output_item.done"},
                {"sequence_number", seq_num++},
-                {"output_index",    0},
+                {"output_index",    output_idx},
                {"item",            output_item},
            }}
        });
        output.push_back(output_item);
        output_idx++;
    }
    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
@ -1120,71 +1135,24 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
            {"data", json {
                {"type",            "response.output_item.done"},
                {"sequence_number", seq_num++},
-                {"output_index",    0},
+                {"output_index",    output_idx},
                {"item",            output_item},
            }}
        });
        output.push_back(output_item);
        output_idx++;
    }
-    // Build output_text convenience field for streaming final event
+    std::string output_text = build_output_text(output);
-    std::string output_text_stream;
+    json resp = build_oai_resp_metadata(oai_resp_id, oaicompat_model, output, output_text,
-    for (const auto & item : output) {
+                                        n_prompt_tokens, n_decoded, n_prompt_tokens_cache);
        if (json_value(item, "type", std::string()) == "message") {
            for (const auto & part : item.at("content")) {
                if (json_value(part, "type", std::string()) == "output_text") {
                    output_text_stream += part.at("text").get<std::string>();
                }
            }
        }
    }
    std::time_t t = std::time(0);
    server_sent_events.push_back(json {
        {"event", "response.completed"},
        {"data", json {
            {"type",            "response.completed"},
            {"sequence_number", seq_num++},
-            {"response", json {
+            {"response",        resp},
                {"completed_at",         t},
                {"created_at",           t},
                {"id",                   oai_resp_id},
                {"object",               "response"},
                {"status",               "completed"},
                {"model",                oaicompat_model},
                {"output",               output},
                {"output_text",          output_text_stream},
                {"usage",                json {
                    {"input_tokens",          n_prompt_tokens},
                    {"output_tokens",         n_decoded},
                    {"total_tokens",          n_decoded + n_prompt_tokens},
                    {"input_tokens_details",  json{{"cached_tokens", n_prompt_tokens_cache}}},
                    {"output_tokens_details", json{{"reasoning_tokens", 0}}},
                }},
                {"incomplete_details",   nullptr},
                {"previous_response_id", nullptr},
                {"instructions",         nullptr},
                {"error",                nullptr},
                {"tools",                json::array()},
                {"tool_choice",          "auto"},
                {"truncation",           "disabled"},
                {"parallel_tool_calls",  false},
                {"text",                 json{{"format", json{{"type", "text"}}}}},
                {"top_p",                1.0},
                {"presence_penalty",     0.0},
                {"frequency_penalty",    0.0},
                {"top_logprobs",         0},
                {"temperature",          1.0},
                {"reasoning",            nullptr},
                {"max_output_tokens",    nullptr},
                {"max_tool_calls",       nullptr},
                {"store",                false},
                {"background",           false},
                {"service_tier",         "default"},
                {"safety_identifier",    nullptr},
                {"prompt_cache_key",     nullptr},
                {"metadata",             json::object()},
            }},
        }}
    });
--- a/tools/server/tests/unit/test_compat_oai_responses.py
+++ b/tools/server/tests/unit/test_compat_oai_responses.py
@ -71,3 +71,334 @@ def test_responses_stream_with_openai_library():
            assert r.response.output[0].id.startswith("msg_")
            assert gathered_text == r.response.output_text
            assert match_regex("(Suddenly)+", r.response.output_text)
 def test_responses_schema_fields():
    """Verify the 24 Response object fields added by this PR are present
    with correct types and default values. These fields are required by
    the OpenAI Responses API spec but were missing before this change."""
    global server
    server.start()
    res = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": "Book",
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    assert res.status_code == 200
    body = res.body
    # Usage sub-fields added by this PR
    usage = body["usage"]
    assert isinstance(usage["input_tokens_details"]["cached_tokens"], int)
    assert isinstance(usage["output_tokens_details"]["reasoning_tokens"], int)
    # All 24 fields added by this PR must be present with correct defaults
    assert body["incomplete_details"] is None
    assert body["previous_response_id"] is None
    assert body["instructions"] is None
    assert body["error"] is None
    assert body["tools"] == []
    assert body["tool_choice"] == "auto"
    assert body["truncation"] == "disabled"
    assert body["parallel_tool_calls"] == False
    assert body["text"] == {"format": {"type": "text"}}
    assert body["top_p"] == 1.0
    assert body["temperature"] == 1.0
    assert body["presence_penalty"] == 0.0
    assert body["frequency_penalty"] == 0.0
    assert body["top_logprobs"] == 0
    assert body["reasoning"] is None
    assert body["max_output_tokens"] is None
    assert body["store"] == False
    assert body["service_tier"] == "default"
    assert body["metadata"] == {}
    assert body["background"] == False
    assert body["safety_identifier"] is None
    assert body["prompt_cache_key"] is None
    assert body["max_tool_calls"] is None
 def test_responses_stream_schema_fields():
    """Verify streaming done-events have the sequence_number, output_index,
    and content_index fields added by this PR. Also verify the completed
    response includes the 24 new schema fields."""
    global server
    server.start()
    res = server.make_stream_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": "Book",
        "max_output_tokens": 8,
        "temperature": 0.8,
        "stream": True,
    })
    seen_seq_nums = []
    saw_output_text_done = False
    saw_content_part_done = False
    saw_output_item_done = False
    completed_response = None
    for data in res:
        if "sequence_number" in data:
            seen_seq_nums.append(data["sequence_number"])
        if data.get("type") == "response.output_text.done":
            saw_output_text_done = True
            assert "content_index" in data
            assert "output_index" in data
            assert "logprobs" in data
            assert isinstance(data["logprobs"], list)
        if data.get("type") == "response.content_part.done":
            saw_content_part_done = True
            assert "content_index" in data
            assert "output_index" in data
        if data.get("type") == "response.output_item.done":
            saw_output_item_done = True
            assert "output_index" in data
        if data.get("type") == "response.completed":
            completed_response = data["response"]
    # Must have seen all done-event types
    assert saw_output_text_done, "never received response.output_text.done"
    assert saw_content_part_done, "never received response.content_part.done"
    assert saw_output_item_done, "never received response.output_item.done"
    # sequence_number must be present on done events and monotonically increasing
    assert len(seen_seq_nums) >= 4, f"expected >= 4 sequenced events, got {len(seen_seq_nums)}"
    assert all(a < b for a, b in zip(seen_seq_nums, seen_seq_nums[1:])), "sequence_numbers not strictly increasing"
    # completed response must have the new schema fields with correct values
    assert completed_response is not None
    assert completed_response["metadata"] == {}
    assert completed_response["store"] == False
    assert completed_response["truncation"] == "disabled"
    assert completed_response["usage"]["output_tokens_details"]["reasoning_tokens"] == 0
 def test_responses_non_function_tool_skipped():
    """Non-function tool types must be silently skipped, producing a valid
    completion with no tools field in the converted chat request. Upstream
    rejects non-function types with 400; our code must return 200 and
    generate output as if no tools were provided."""
    global server
    server.start()
    res = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
        "tools": [
            {"type": "web_search"},
            {"type": "code_interpreter"},
        ],
    })
    assert res.status_code == 200
    assert res.body["status"] == "completed"
    # With all tools skipped, the model must still produce text output
    assert len(res.body["output"]) > 0
    assert len(res.body["output_text"]) > 0
 def test_responses_only_non_function_tools_same_as_no_tools():
    """When ALL tools are non-function types, they should all be filtered out
    and the result should be identical to a request with no tools at all.
    Compare token counts to confirm the tools field was truly empty."""
    global server
    server.start()
    no_tools = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    with_skipped_tools = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
        "tools": [
            {"type": "web_search"},
            {"type": "code_interpreter"},
            {"type": "file_search"},
        ],
    })
    assert no_tools.status_code == 200
    assert with_skipped_tools.status_code == 200
    # If tools were truly stripped, prompt token count must be identical
    assert with_skipped_tools.body["usage"]["input_tokens"] == no_tools.body["usage"]["input_tokens"]
 def test_responses_extra_keys_stripped():
    """Responses-only request keys (store, include, prompt_cache_key, etc.)
    must be stripped before forwarding to the chat completions handler.
    The completion must succeed and produce the same output as a request
    without those keys."""
    global server
    server.start()
    # Baseline without extra keys
    baseline = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    assert baseline.status_code == 200
    # Same request with extra Responses-only keys
    res = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
        "store": True,
        "include": ["usage"],
        "prompt_cache_key": "test_key",
        "web_search": {"enabled": True},
        "text": {"format": {"type": "text"}},
        "truncation": "auto",
        "metadata": {"key": "value"},
    })
    assert res.status_code == 200
    assert res.body["status"] == "completed"
    # Extra keys must not affect token consumption
    assert res.body["usage"]["input_tokens"] == baseline.body["usage"]["input_tokens"]
 def test_responses_developer_role_merging():
    """Developer role messages must be merged into the first system message
    at position 0. This ensures templates that require a single system
    message don't see developer content as a separate turn.
    We verify by comparing token counts: system + developer merged should
    consume the same prompt tokens as a single system message with the
    combined content."""
    global server
    server.start()
    # Single combined system message
    combined = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": [
                {"type": "input_text", "text": "Book"},
                {"type": "input_text", "text": "Keep it short"},
            ]},
            {"role": "user", "content": [{"type": "input_text", "text": "What is the best book"}]},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    assert combined.status_code == 200
    # Split system + developer (should be merged to same prompt)
    split = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": [{"type": "input_text", "text": "Book"}]},
            {"role": "user", "content": [{"type": "input_text", "text": "What is the best book"}]},
            {"role": "developer", "content": [{"type": "input_text", "text": "Keep it short"}]},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    assert split.status_code == 200
    assert split.body["status"] == "completed"
    # Merged prompt should consume same number of input tokens
    assert split.body["usage"]["input_tokens"] == combined.body["usage"]["input_tokens"]
 def test_responses_input_text_type_multi_turn():
    """input_text type must be accepted for assistant messages (EasyInputMessage).
    An assistant message without explicit type:'message' must also be accepted
    (AssistantMessageItemParam). Verify the multi-turn context is preserved
    by checking the model sees the full conversation."""
    global server
    server.start()
    res = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "user", "content": [{"type": "input_text", "text": "Hello"}]},
            {
                "role": "assistant",
                "content": [{"type": "input_text", "text": "Hi there"}],
            },
            {"role": "user", "content": [{"type": "input_text", "text": "How are you"}]},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    assert res.status_code == 200
    assert res.body["status"] == "completed"
    # Multi-turn input should result in more prompt tokens than single-turn
    single = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": "How are you",
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    assert single.status_code == 200
    assert res.body["usage"]["input_tokens"] > single.body["usage"]["input_tokens"]
 def test_responses_output_text_matches_content():
    """output_text must be the concatenation of all output_text content parts.
    Verify this for both streaming and non-streaming responses."""
    global server
    server.start()
    # Non-streaming
    res = server.make_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
    })
    assert res.status_code == 200
    # Manually reconstruct output_text from content parts
    reconstructed = ""
    for item in res.body["output"]:
        if item.get("type") == "message":
            for part in item["content"]:
                if part.get("type") == "output_text":
                    reconstructed += part["text"]
    assert res.body["output_text"] == reconstructed
    assert len(reconstructed) > 0
 def test_responses_stream_output_text_consistency():
    """Streaming gathered text must match the output_text in response.completed."""
    global server
    server.start()
    res = server.make_stream_request("POST", "/v1/responses", data={
        "model": "gpt-4.1",
        "input": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        "max_output_tokens": 8,
        "temperature": 0.8,
        "stream": True,
    })
    gathered_text = ""
    completed_output_text = None
    for data in res:
        if data.get("type") == "response.output_text.delta":
            gathered_text += data["delta"]
        if data.get("type") == "response.completed":
            completed_output_text = data["response"]["output_text"]
            # Also verify content parts match
            for item in data["response"]["output"]:
                if item.get("type") == "message":
                    for part in item["content"]:
                        if part.get("type") == "output_text":
                            assert part["text"] == gathered_text
    assert completed_output_text is not None
    assert gathered_text == completed_output_text
    assert len(gathered_text) > 0