server : add thinking content blocks to Anthropic Messages API
Add support for returning reasoning/thinking content in Anthropic API responses when using models with --reasoning-format deepseek and the thinking parameter enabled.

- Non-streaming: adds a thinking block before the text block in the content array
- Streaming: emits thinking_delta events with correct block indices
- Partial streaming: tracks reasoning state across chunks via the anthropic_has_reasoning member variable

Tested with the bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF model.
This commit is contained in:
parent
f38de16341
commit
862b96c45d
|
|
@ -811,6 +811,14 @@ json server_task_result_cmpl_final::to_json_anthropic() {
|
||||||
msg.content = content;
|
msg.content = content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// thinking block comes first (Anthropic extended thinking format)
|
||||||
|
if (!msg.reasoning_content.empty()) {
|
||||||
|
content_blocks.push_back({
|
||||||
|
{"type", "thinking"},
|
||||||
|
{"thinking", msg.reasoning_content}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (!msg.content.empty()) {
|
if (!msg.content.empty()) {
|
||||||
content_blocks.push_back({
|
content_blocks.push_back({
|
||||||
{"type", "text"},
|
{"type", "text"},
|
||||||
|
|
@ -859,20 +867,57 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||||
stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
|
stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
|
||||||
}
|
}
|
||||||
|
|
||||||
bool has_text = !oaicompat_msg.content.empty();
|
bool has_thinking = !oaicompat_msg.reasoning_content.empty();
|
||||||
|
bool has_text = !oaicompat_msg.content.empty();
|
||||||
size_t num_tool_calls = oaicompat_msg.tool_calls.size();
|
size_t num_tool_calls = oaicompat_msg.tool_calls.size();
|
||||||
|
|
||||||
bool text_block_started = false;
|
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
|
||||||
|
size_t thinking_block_index = 0;
|
||||||
|
size_t text_block_index = has_thinking ? 1 : 0;
|
||||||
|
|
||||||
|
bool thinking_block_started = false;
|
||||||
|
bool text_block_started = false;
|
||||||
std::unordered_set<size_t> tool_calls_started;
|
std::unordered_set<size_t> tool_calls_started;
|
||||||
|
|
||||||
for (const auto & diff : oaicompat_msg_diffs) {
|
for (const auto & diff : oaicompat_msg_diffs) {
|
||||||
|
// handle thinking/reasoning content
|
||||||
|
if (!diff.reasoning_content_delta.empty()) {
|
||||||
|
if (!thinking_block_started) {
|
||||||
|
events.push_back({
|
||||||
|
{"event", "content_block_start"},
|
||||||
|
{"data", {
|
||||||
|
{"type", "content_block_start"},
|
||||||
|
{"index", thinking_block_index},
|
||||||
|
{"content_block", {
|
||||||
|
{"type", "thinking"},
|
||||||
|
{"thinking", ""}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
});
|
||||||
|
thinking_block_started = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
events.push_back({
|
||||||
|
{"event", "content_block_delta"},
|
||||||
|
{"data", {
|
||||||
|
{"type", "content_block_delta"},
|
||||||
|
{"index", thinking_block_index},
|
||||||
|
{"delta", {
|
||||||
|
{"type", "thinking_delta"},
|
||||||
|
{"thinking", diff.reasoning_content_delta}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle regular text content
|
||||||
if (!diff.content_delta.empty()) {
|
if (!diff.content_delta.empty()) {
|
||||||
if (!text_block_started) {
|
if (!text_block_started) {
|
||||||
events.push_back({
|
events.push_back({
|
||||||
{"event", "content_block_start"},
|
{"event", "content_block_start"},
|
||||||
{"data", {
|
{"data", {
|
||||||
{"type", "content_block_start"},
|
{"type", "content_block_start"},
|
||||||
{"index", 0},
|
{"index", text_block_index},
|
||||||
{"content_block", {
|
{"content_block", {
|
||||||
{"type", "text"},
|
{"type", "text"},
|
||||||
{"text", ""}
|
{"text", ""}
|
||||||
|
|
@ -886,7 +931,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||||
{"event", "content_block_delta"},
|
{"event", "content_block_delta"},
|
||||||
{"data", {
|
{"data", {
|
||||||
{"type", "content_block_delta"},
|
{"type", "content_block_delta"},
|
||||||
{"index", 0},
|
{"index", text_block_index},
|
||||||
{"delta", {
|
{"delta", {
|
||||||
{"type", "text_delta"},
|
{"type", "text_delta"},
|
||||||
{"text", diff.content_delta}
|
{"text", diff.content_delta}
|
||||||
|
|
@ -895,8 +940,9 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// handle tool calls
|
||||||
if (diff.tool_call_index != std::string::npos) {
|
if (diff.tool_call_index != std::string::npos) {
|
||||||
size_t content_block_index = (has_text ? 1 : 0) + diff.tool_call_index;
|
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + diff.tool_call_index;
|
||||||
|
|
||||||
if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
|
if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
|
||||||
const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];
|
const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];
|
||||||
|
|
@ -932,18 +978,29 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// close content blocks in order
|
||||||
|
if (has_thinking) {
|
||||||
|
events.push_back({
|
||||||
|
{"event", "content_block_stop"},
|
||||||
|
{"data", {
|
||||||
|
{"type", "content_block_stop"},
|
||||||
|
{"index", thinking_block_index}
|
||||||
|
}}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (has_text) {
|
if (has_text) {
|
||||||
events.push_back({
|
events.push_back({
|
||||||
{"event", "content_block_stop"},
|
{"event", "content_block_stop"},
|
||||||
{"data", {
|
{"data", {
|
||||||
{"type", "content_block_stop"},
|
{"type", "content_block_stop"},
|
||||||
{"index", 0}
|
{"index", text_block_index}
|
||||||
}}
|
}}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < num_tool_calls; i++) {
|
for (size_t i = 0; i < num_tool_calls; i++) {
|
||||||
size_t content_block_index = (has_text ? 1 : 0) + i;
|
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + i;
|
||||||
events.push_back({
|
events.push_back({
|
||||||
{"event", "content_block_stop"},
|
{"event", "content_block_stop"},
|
||||||
{"data", {
|
{"data", {
|
||||||
|
|
@ -1151,11 +1208,10 @@ json server_task_result_rerank::to_json() {
|
||||||
json server_task_result_cmpl_partial::to_json_anthropic() {
|
json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||||
json events = json::array();
|
json events = json::array();
|
||||||
bool first = (n_decoded == 1);
|
bool first = (n_decoded == 1);
|
||||||
bool text_block_started = false;
|
bool thinking_block_started = false;
|
||||||
|
bool text_block_started = false;
|
||||||
|
|
||||||
if (first) {
|
if (first) {
|
||||||
text_block_started = false;
|
|
||||||
|
|
||||||
events.push_back({
|
events.push_back({
|
||||||
{"event", "message_start"},
|
{"event", "message_start"},
|
||||||
{"data", {
|
{"data", {
|
||||||
|
|
@ -1177,14 +1233,50 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
|
||||||
|
size_t thinking_block_index = 0;
|
||||||
|
// use anthropic_has_reasoning (set in update()) to know if ANY reasoning was generated
|
||||||
|
size_t text_block_index = anthropic_has_reasoning ? 1 : 0;
|
||||||
|
|
||||||
for (const auto & diff : oaicompat_msg_diffs) {
|
for (const auto & diff : oaicompat_msg_diffs) {
|
||||||
|
// handle thinking/reasoning content
|
||||||
|
if (!diff.reasoning_content_delta.empty()) {
|
||||||
|
if (!thinking_block_started) {
|
||||||
|
events.push_back({
|
||||||
|
{"event", "content_block_start"},
|
||||||
|
{"data", {
|
||||||
|
{"type", "content_block_start"},
|
||||||
|
{"index", thinking_block_index},
|
||||||
|
{"content_block", {
|
||||||
|
{"type", "thinking"},
|
||||||
|
{"thinking", ""}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
});
|
||||||
|
thinking_block_started = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
events.push_back({
|
||||||
|
{"event", "content_block_delta"},
|
||||||
|
{"data", {
|
||||||
|
{"type", "content_block_delta"},
|
||||||
|
{"index", thinking_block_index},
|
||||||
|
{"delta", {
|
||||||
|
{"type", "thinking_delta"},
|
||||||
|
{"thinking", diff.reasoning_content_delta}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle regular text content
|
||||||
if (!diff.content_delta.empty()) {
|
if (!diff.content_delta.empty()) {
|
||||||
if (!text_block_started) {
|
if (!text_block_started) {
|
||||||
events.push_back({
|
events.push_back({
|
||||||
{"event", "content_block_start"},
|
{"event", "content_block_start"},
|
||||||
{"data", {
|
{"data", {
|
||||||
{"type", "content_block_start"},
|
{"type", "content_block_start"},
|
||||||
{"index", 0},
|
{"index", text_block_index},
|
||||||
{"content_block", {
|
{"content_block", {
|
||||||
{"type", "text"},
|
{"type", "text"},
|
||||||
{"text", ""}
|
{"text", ""}
|
||||||
|
|
@ -1198,7 +1290,7 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||||
{"event", "content_block_delta"},
|
{"event", "content_block_delta"},
|
||||||
{"data", {
|
{"data", {
|
||||||
{"type", "content_block_delta"},
|
{"type", "content_block_delta"},
|
||||||
{"index", 0},
|
{"index", text_block_index},
|
||||||
{"delta", {
|
{"delta", {
|
||||||
{"type", "text_delta"},
|
{"type", "text_delta"},
|
||||||
{"text", diff.content_delta}
|
{"text", diff.content_delta}
|
||||||
|
|
@ -1207,8 +1299,10 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// handle tool calls
|
||||||
if (diff.tool_call_index != std::string::npos) {
|
if (diff.tool_call_index != std::string::npos) {
|
||||||
size_t content_block_index = (text_block_started ? 1 : 0) + diff.tool_call_index;
|
// use anthropic_has_reasoning for thinking block count (persists across calls)
|
||||||
|
size_t content_block_index = (anthropic_has_reasoning ? 1 : 0) + (text_block_started ? 1 : 0) + diff.tool_call_index;
|
||||||
|
|
||||||
if (!diff.tool_call_delta.name.empty()) {
|
if (!diff.tool_call_delta.name.empty()) {
|
||||||
events.push_back({
|
events.push_back({
|
||||||
|
|
|
||||||
|
|
@ -337,6 +337,9 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||||
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
|
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
|
||||||
bool is_updated = false;
|
bool is_updated = false;
|
||||||
|
|
||||||
|
// for Anthropic API: track if any reasoning content has been generated
|
||||||
|
bool anthropic_has_reasoning = false;
|
||||||
|
|
||||||
virtual bool is_stop() override {
|
virtual bool is_stop() override {
|
||||||
return false; // in stream mode, partial responses are not considered stop
|
return false; // in stream mode, partial responses are not considered stop
|
||||||
}
|
}
|
||||||
|
|
@ -346,6 +349,8 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||||
virtual void update(task_result_state & state) override {
|
virtual void update(task_result_state & state) override {
|
||||||
is_updated = true;
|
is_updated = true;
|
||||||
state.update_chat_msg(content, true, oaicompat_msg_diffs);
|
state.update_chat_msg(content, true, oaicompat_msg_diffs);
|
||||||
|
// track if the accumulated message has any reasoning content
|
||||||
|
anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
json to_json_non_oaicompat();
|
json to_json_non_oaicompat();
|
||||||
|
|
|
||||||
|
|
@ -805,3 +805,85 @@ def test_anthropic_vs_openai_different_response_format():
|
||||||
assert "input_tokens" in anthropic_res.body["usage"]
|
assert "input_tokens" in anthropic_res.body["usage"]
|
||||||
assert "completion_tokens" in openai_res.body["usage"]
|
assert "completion_tokens" in openai_res.body["usage"]
|
||||||
assert "output_tokens" in anthropic_res.body["usage"]
|
assert "output_tokens" in anthropic_res.body["usage"]
|
||||||
|
|
||||||
|
|
||||||
|
# Extended thinking tests with reasoning models
|
||||||
|
|
||||||
|
@pytest.mark.slow
@pytest.mark.parametrize("stream", [False, True])
def test_anthropic_thinking_with_reasoning_model(stream):
    """Check that /v1/messages returns thinking content blocks for a reasoning model.

    Starts a dedicated server with the DeepSeek-R1 distill model and
    ``reasoning_format = "deepseek"``, sends one request with the ``thinking``
    parameter enabled, and verifies the Anthropic-style response:

    * streaming: a ``thinking`` content block starts at index 0, receives
      ``thinking_delta`` events on that same index, and a ``text`` block
      starts at index 1;
    * non-streaming: the ``content`` array begins with a non-empty
      ``thinking`` block and also contains a ``text`` block.

    Args:
        stream: whether to exercise the streaming or the non-streaming path.
    """
    global server
    server = ServerProcess()
    server.model_hf_repo = "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF"
    server.model_hf_file = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
    server.reasoning_format = "deepseek"
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = 1024
    server.server_port = 8084
    server.start(timeout_seconds=600)  # large model needs time to download

    # Same request for both modes; only the "stream" flag differs.
    payload = {
        "model": "test",
        "max_tokens": 1024,
        "thinking": {
            "type": "enabled",
            "budget_tokens": 500
        },
        "messages": [
            {"role": "user", "content": "What is 2+2?"}
        ],
    }

    if stream:
        res = server.make_stream_request("POST", "/v1/messages", data={**payload, "stream": True})

        events = list(res)

        # should have thinking content block events
        thinking_starts = [e for e in events if
                           e.get("type") == "content_block_start" and
                           e.get("content_block", {}).get("type") == "thinking"]
        assert len(thinking_starts) > 0, "Should have thinking content_block_start event"
        assert thinking_starts[0]["index"] == 0, "Thinking block should be at index 0"

        # should have thinking_delta events, all addressed to the thinking block
        thinking_deltas = [e for e in events if
                           e.get("type") == "content_block_delta" and
                           e.get("delta", {}).get("type") == "thinking_delta"]
        assert len(thinking_deltas) > 0, "Should have thinking_delta events"
        assert all(e.get("index") == 0 for e in thinking_deltas), \
            "thinking_delta events should target the thinking block (index 0)"

        # should have text block after thinking
        text_starts = [e for e in events if
                       e.get("type") == "content_block_start" and
                       e.get("content_block", {}).get("type") == "text"]
        assert len(text_starts) > 0, "Should have text content_block_start event"
        assert text_starts[0]["index"] == 1, "Text block should be at index 1 (after thinking)"
    else:
        res = server.make_request("POST", "/v1/messages", data=payload)

        assert res.status_code == 200
        assert res.body["type"] == "message"

        content = res.body["content"]
        assert len(content) >= 2, "Should have at least thinking and text blocks"

        # first block should be thinking
        assert content[0].get("type") == "thinking", "First content block should be the thinking block"
        thinking_blocks = [b for b in content if b.get("type") == "thinking"]
        assert len(thinking_blocks) > 0, "Should have thinking content block"
        assert "thinking" in thinking_blocks[0], "Thinking block should have 'thinking' field"
        assert len(thinking_blocks[0]["thinking"]) > 0, "Thinking content should not be empty"

        # should also have text block
        text_blocks = [b for b in content if b.get("type") == "text"]
        assert len(text_blocks) > 0, "Should have text content block"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue