feat(server): add cache_read_input_tokens to usage metrics and response structure

- Added `n_cache_read_input_tokens` field to `server_task_result_cmpl_final` and `server_task_result_cmpl_partial` structs
- Populated `cache_read_input_tokens` in JSON output for both final and streaming responses
- Ensured `cache_read_input_tokens` is non-negative by clamping to zero if negative
- Updated unit tests to validate presence, type, and non-negativity of `cache_read_input_tokens` in usage metrics
This commit is contained in:
Matthieu Beaumont 2026-02-12 18:52:22 +01:00
parent 4b385bfcf8
commit 0252fd253b
4 changed files with 16 additions and 0 deletions

View File

@ -1415,6 +1415,7 @@ private:
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.task->n_tokens();
res->n_cache_read_input_tokens = slot.n_prompt_tokens_cache;
res->post_sampling_probs = slot.task->params.post_sampling_probs;
res->verbose = slot.task->params.verbose;
@ -1457,6 +1458,7 @@ private:
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.task->n_tokens();
res->n_cache_read_input_tokens = slot.n_prompt_tokens_cache;
res->n_tokens_cached = slot.prompt.n_tokens();
res->has_new_line = slot.has_new_line;
res->stopping_word = slot.stopping_word;

View File

@ -1010,6 +1010,7 @@ json server_task_result_cmpl_final::to_json_anthropic() {
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
    stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
}
const int32_t cache_read_input_tokens = n_cache_read_input_tokens >= 0 ? n_cache_read_input_tokens : 0;

json content_blocks = json::array();
@ -1063,6 +1064,7 @@ json server_task_result_cmpl_final::to_json_anthropic() {
{"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)},
{"usage", {
    {"input_tokens", n_prompt_tokens},
    {"cache_read_input_tokens", cache_read_input_tokens},
    {"output_tokens", n_decoded}
}}
};
@ -1077,6 +1079,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
    stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
}
const int32_t cache_read_input_tokens = n_cache_read_input_tokens >= 0 ? n_cache_read_input_tokens : 0;

bool has_thinking = !oaicompat_msg.reasoning_content.empty();
bool has_text = !oaicompat_msg.content.empty();
@ -1243,6 +1246,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
{"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}
}},
{"usage", {
    {"cache_read_input_tokens", cache_read_input_tokens},
    {"output_tokens", n_decoded}
}}
}}
@ -1555,6 +1559,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
json server_task_result_cmpl_partial::to_json_anthropic() {
json events = json::array();
bool first = (n_decoded == 1);
const int32_t cache_read_input_tokens = n_cache_read_input_tokens >= 0 ? n_cache_read_input_tokens : 0;

// use member variables to track block state across streaming calls
// (anthropic_thinking_block_started, anthropic_text_block_started)
@ -1573,6 +1578,7 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
{"stop_sequence", nullptr},
{"usage", {
    {"input_tokens", n_prompt_tokens},
    {"cache_read_input_tokens", cache_read_input_tokens},
    {"output_tokens", 0}
}}
}}

View File

@ -342,6 +342,7 @@ struct server_task_result_cmpl_final : server_task_result {
bool truncated;
int32_t n_decoded;
int32_t n_prompt_tokens;
int32_t n_cache_read_input_tokens = 0;
int32_t n_tokens_cached;
bool has_new_line;
std::string stopping_word;
@ -406,6 +407,7 @@ struct server_task_result_cmpl_partial : server_task_result {
int32_t n_decoded;
int32_t n_prompt_tokens;
int32_t n_cache_read_input_tokens = 0;
bool post_sampling_probs;
bool is_progress = false;

View File

@ -64,9 +64,12 @@ def test_anthropic_messages_basic():
assert res.body["stop_reason"] in ["end_turn", "max_tokens"], f"Invalid stop_reason: {res.body.get('stop_reason')}"
assert "usage" in res.body, "Missing 'usage' field"
assert "input_tokens" in res.body["usage"], "Missing usage.input_tokens"
assert "cache_read_input_tokens" in res.body["usage"], "Missing usage.cache_read_input_tokens"
assert "output_tokens" in res.body["usage"], "Missing usage.output_tokens"
assert isinstance(res.body["usage"]["input_tokens"], int), "input_tokens should be integer"
assert isinstance(res.body["usage"]["cache_read_input_tokens"], int), "cache_read_input_tokens should be integer"
assert isinstance(res.body["usage"]["output_tokens"], int), "output_tokens should be integer"
assert res.body["usage"]["cache_read_input_tokens"] >= 0, "cache_read_input_tokens should be >= 0"
assert res.body["usage"]["output_tokens"] > 0, "Should have generated some tokens"
# Anthropic API should NOT include timings
assert "timings" not in res.body, "Anthropic API should not include timings field"
@ -168,6 +171,9 @@ def test_anthropic_messages_streaming():
assert message_start["message"]["content"] == []
assert "usage" in message_start["message"]
assert message_start["message"]["usage"]["input_tokens"] > 0
assert "cache_read_input_tokens" in message_start["message"]["usage"]
assert isinstance(message_start["message"]["usage"]["cache_read_input_tokens"], int)
assert message_start["message"]["usage"]["cache_read_input_tokens"] >= 0

# Check content_block_start
block_start = next(e for e in events if e["type"] == "content_block_start")