From 0252fd253b27ffa3fa90713eeb9b77a1d906fdef Mon Sep 17 00:00:00 2001
From: Matthieu Beaumont
Date: Thu, 12 Feb 2026 18:52:22 +0100
Subject: [PATCH] feat(server): add cache_read_input_tokens to usage metrics
 and response structure

- Added `n_cache_read_input_tokens` field to `server_task_result_cmpl_final`
  and `server_task_result_cmpl_partial` structs
- Populated `cache_read_input_tokens` in JSON output for both final and
  streaming responses
- Ensured `cache_read_input_tokens` is non-negative by clamping to zero if
  negative
- Updated unit tests to validate presence, type, and non-negativity of
  `cache_read_input_tokens` in usage metrics
---
 tools/server/server-context.cpp                  | 2 ++
 tools/server/server-task.cpp                     | 6 ++++++
 tools/server/server-task.h                       | 2 ++
 tools/server/tests/unit/test_compat_anthropic.py | 6 ++++++
 4 files changed, 16 insertions(+)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ceafcac179..7c63752da8 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1415,6 +1415,7 @@ private:
 
         res->n_decoded = slot.n_decoded;
         res->n_prompt_tokens = slot.task->n_tokens();
+        res->n_cache_read_input_tokens = slot.n_prompt_tokens_cache;
         res->post_sampling_probs = slot.task->params.post_sampling_probs;
 
         res->verbose = slot.task->params.verbose;
@@ -1457,6 +1458,7 @@ private:
         res->truncated = slot.truncated;
         res->n_decoded = slot.n_decoded;
         res->n_prompt_tokens = slot.task->n_tokens();
+        res->n_cache_read_input_tokens = slot.n_prompt_tokens_cache;
         res->n_tokens_cached = slot.prompt.n_tokens();
         res->has_new_line = slot.has_new_line;
         res->stopping_word = slot.stopping_word;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index a137427c69..93cf5bfd80 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -1010,6 +1010,7 @@ json server_task_result_cmpl_final::to_json_anthropic() {
     if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
         stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
     }
+    const int32_t cache_read_input_tokens = n_cache_read_input_tokens >= 0 ? n_cache_read_input_tokens : 0;
 
     json content_blocks = json::array();
 
@@ -1063,6 +1064,7 @@ json server_task_result_cmpl_final::to_json_anthropic() {
         {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)},
         {"usage", {
             {"input_tokens", n_prompt_tokens},
+            {"cache_read_input_tokens", cache_read_input_tokens},
             {"output_tokens", n_decoded}
         }}
     };
@@ -1077,6 +1079,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
     if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
         stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
     }
+    const int32_t cache_read_input_tokens = n_cache_read_input_tokens >= 0 ? n_cache_read_input_tokens : 0;
 
     bool has_thinking = !oaicompat_msg.reasoning_content.empty();
     bool has_text = !oaicompat_msg.content.empty();
@@ -1243,6 +1246,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
             {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}
         }},
         {"usage", {
+            {"cache_read_input_tokens", cache_read_input_tokens},
             {"output_tokens", n_decoded}
         }}
     }}
@@ -1555,6 +1559,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
 json server_task_result_cmpl_partial::to_json_anthropic() {
     json events = json::array();
     bool first = (n_decoded == 1);
+    const int32_t cache_read_input_tokens = n_cache_read_input_tokens >= 0 ? n_cache_read_input_tokens : 0;
 
     // use member variables to track block state across streaming calls
     // (anthropic_thinking_block_started, anthropic_text_block_started)
@@ -1573,6 +1578,7 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
             {"stop_sequence", nullptr},
             {"usage", {
                 {"input_tokens", n_prompt_tokens},
+                {"cache_read_input_tokens", cache_read_input_tokens},
                 {"output_tokens", 0}
             }}
         }}
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index a69e8f1a3d..1129b679eb 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -342,6 +342,7 @@ struct server_task_result_cmpl_final : server_task_result {
     bool truncated;
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_cache_read_input_tokens = 0;
     int32_t n_tokens_cached;
     bool has_new_line;
     std::string stopping_word;
@@ -406,6 +407,7 @@ struct server_task_result_cmpl_partial : server_task_result {
 
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_cache_read_input_tokens = 0;
 
     bool post_sampling_probs;
     bool is_progress = false;
diff --git a/tools/server/tests/unit/test_compat_anthropic.py b/tools/server/tests/unit/test_compat_anthropic.py
index e16e0235c6..8593e68bd9 100644
--- a/tools/server/tests/unit/test_compat_anthropic.py
+++ b/tools/server/tests/unit/test_compat_anthropic.py
@@ -64,9 +64,12 @@ def test_anthropic_messages_basic():
     assert res.body["stop_reason"] in ["end_turn", "max_tokens"], f"Invalid stop_reason: {res.body.get('stop_reason')}"
     assert "usage" in res.body, "Missing 'usage' field"
     assert "input_tokens" in res.body["usage"], "Missing usage.input_tokens"
+    assert "cache_read_input_tokens" in res.body["usage"], "Missing usage.cache_read_input_tokens"
     assert "output_tokens" in res.body["usage"], "Missing usage.output_tokens"
     assert isinstance(res.body["usage"]["input_tokens"], int), "input_tokens should be integer"
+    assert isinstance(res.body["usage"]["cache_read_input_tokens"], int), "cache_read_input_tokens should be integer"
     assert isinstance(res.body["usage"]["output_tokens"], int), "output_tokens should be integer"
+    assert res.body["usage"]["cache_read_input_tokens"] >= 0, "cache_read_input_tokens should be >= 0"
     assert res.body["usage"]["output_tokens"] > 0, "Should have generated some tokens"
     # Anthropic API should NOT include timings
     assert "timings" not in res.body, "Anthropic API should not include timings field"
@@ -168,6 +171,9 @@ def test_anthropic_messages_streaming():
     assert message_start["message"]["content"] == []
     assert "usage" in message_start["message"]
     assert message_start["message"]["usage"]["input_tokens"] > 0
+    assert "cache_read_input_tokens" in message_start["message"]["usage"]
+    assert isinstance(message_start["message"]["usage"]["cache_read_input_tokens"], int)
+    assert message_start["message"]["usage"]["cache_read_input_tokens"] >= 0
 
     # Check content_block_start
     block_start = next(e for e in events if e["type"] == "content_block_start")
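
Note on verifying this end to end (not part of the patch): the sketch below hits
the Anthropic-compatible endpoint twice and prints the usage fields. It assumes
a llama-server built with this patch listening on localhost:8080 and serving the
Messages API at /v1/messages; the host, port, endpoint path, and model alias are
assumptions for illustration.

    import requests

    # Send the same prompt twice; the second request can typically reuse the
    # prompt KV cache, which this patch surfaces as cache_read_input_tokens.
    for attempt in range(2):
        resp = requests.post(
            "http://localhost:8080/v1/messages",  # assumed endpoint/port
            json={
                "model": "default",  # assumed model alias
                "max_tokens": 32,
                "messages": [{"role": "user", "content": "Hello"}],
            },
        )
        usage = resp.json()["usage"]
        # input_tokens still counts the full prompt; cache_read_input_tokens
        # reports how many of those tokens were read from cache (clamped >= 0).
        print(attempt, usage["input_tokens"],
              usage["cache_read_input_tokens"], usage["output_tokens"])

One design note: the patch always emits the field, reporting 0 when nothing was
reused rather than omitting it, which matches Anthropic's usage schema and keeps
client-side parsing unconditional.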