server : support preserving reasoning_content in assistant message (#18994)
* support reasoning_content input * report template caps to webui * add docs * rm commented code
This commit is contained in:
parent
a5eaa1d6a3
commit
51fa458a92
|
|
@ -1630,7 +1630,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
||||||
}
|
}
|
||||||
auto msg = builder.result();
|
auto msg = builder.result();
|
||||||
if (!is_partial) {
|
if (!is_partial) {
|
||||||
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
|
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
|
||||||
}
|
}
|
||||||
return msg;
|
return msg;
|
||||||
}
|
}
|
||||||
|
|
@ -1663,7 +1663,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std
|
||||||
mapper.from_ast(ctx.ast, result);
|
mapper.from_ast(ctx.ast, result);
|
||||||
}
|
}
|
||||||
if (!is_partial) {
|
if (!is_partial) {
|
||||||
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
|
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
|
||||||
}
|
}
|
||||||
return msg;
|
return msg;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
175
common/chat.cpp
175
common/chat.cpp
|
|
@ -7,9 +7,6 @@
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "regex-partial.h"
|
#include "regex-partial.h"
|
||||||
|
|
||||||
// #include <minja/chat-template.hpp>
|
|
||||||
// #include <minja/minja.hpp>
|
|
||||||
|
|
||||||
#include "jinja/parser.h"
|
#include "jinja/parser.h"
|
||||||
#include "jinja/value.h"
|
#include "jinja/value.h"
|
||||||
#include "jinja/runtime.h"
|
#include "jinja/runtime.h"
|
||||||
|
|
@ -56,39 +53,73 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) {
|
||||||
return !msg.content.empty() || !msg.tool_calls.empty();
|
return !msg.content.empty() || !msg.tool_calls.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
|
||||||
json common_chat_msg::to_json_oaicompat() const
|
if (!content.empty() && !content_parts.empty()) {
|
||||||
{
|
throw std::runtime_error("Cannot specify both content and content_parts");
|
||||||
json message {
|
|
||||||
{"role", "assistant"},
|
|
||||||
};
|
|
||||||
if (!reasoning_content.empty()) {
|
|
||||||
message["reasoning_content"] = reasoning_content;
|
|
||||||
}
|
}
|
||||||
if (content.empty() && !tool_calls.empty()) {
|
json jmsg {
|
||||||
message["content"] = json();
|
{"role", role},
|
||||||
|
};
|
||||||
|
if (!content.empty()) {
|
||||||
|
jmsg["content"] = content;
|
||||||
|
} else if (!content_parts.empty()) {
|
||||||
|
if (concat_typed_text) {
|
||||||
|
std::string text;
|
||||||
|
for (const auto & part : content_parts) {
|
||||||
|
if (part.type != "text") {
|
||||||
|
LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!text.empty()) {
|
||||||
|
text += '\n';
|
||||||
|
}
|
||||||
|
text += part.text;
|
||||||
|
}
|
||||||
|
jmsg["content"] = text;
|
||||||
|
} else {
|
||||||
|
auto & parts = jmsg["content"] = json::array();
|
||||||
|
for (const auto & part : content_parts) {
|
||||||
|
parts.push_back({
|
||||||
|
{"type", part.type},
|
||||||
|
{"text", part.text},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
message["content"] = content;
|
jmsg["content"] = "";
|
||||||
|
}
|
||||||
|
if (!reasoning_content.empty()) {
|
||||||
|
jmsg["reasoning_content"] = reasoning_content;
|
||||||
|
}
|
||||||
|
if (!tool_name.empty()) {
|
||||||
|
jmsg["name"] = tool_name;
|
||||||
|
}
|
||||||
|
if (!tool_call_id.empty()) {
|
||||||
|
jmsg["tool_call_id"] = tool_call_id;
|
||||||
}
|
}
|
||||||
if (!tool_calls.empty()) {
|
if (!tool_calls.empty()) {
|
||||||
auto arr = json::array();
|
jmsg["tool_calls"] = json::array();
|
||||||
for (const auto & tc : tool_calls) {
|
auto & jtool_calls = jmsg["tool_calls"];
|
||||||
arr.push_back({
|
for (const auto & tool_call : tool_calls) {
|
||||||
|
json tc {
|
||||||
{"type", "function"},
|
{"type", "function"},
|
||||||
{"function", {
|
{"function", {
|
||||||
{"name", tc.name},
|
{"name", tool_call.name},
|
||||||
{"arguments", tc.arguments},
|
{"arguments", tool_call.arguments},
|
||||||
}},
|
}},
|
||||||
{"id", tc.id},
|
};
|
||||||
// // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
|
if (!tool_call.id.empty()) {
|
||||||
// // We only generate a random id for the ones that don't generate one by themselves
|
tc["id"] = tool_call.id;
|
||||||
// // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
|
}
|
||||||
// {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
|
// Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
|
||||||
});
|
// We only generate a random id for the ones that don't generate one by themselves
|
||||||
|
// (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
|
||||||
|
// {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
|
||||||
|
jtool_calls.push_back(tc);
|
||||||
}
|
}
|
||||||
message["tool_calls"] = arr;
|
|
||||||
}
|
}
|
||||||
return message;
|
|
||||||
|
return jmsg;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) {
|
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) {
|
||||||
|
|
@ -256,7 +287,6 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
|
||||||
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
|
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
|
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
|
||||||
std::vector<common_chat_msg> msgs;
|
std::vector<common_chat_msg> msgs;
|
||||||
|
|
||||||
|
|
@ -350,80 +380,15 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||||
return msgs;
|
return msgs;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
|
json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
|
||||||
json messages = json::array();
|
json messages = json::array();
|
||||||
for (const auto & msg : msgs) {
|
for (const auto & msg : msgs) {
|
||||||
if (!msg.content.empty() && !msg.content_parts.empty()) {
|
json jmsg = msg.to_json_oaicompat(concat_typed_text);
|
||||||
throw std::runtime_error("Cannot specify both content and content_parts");
|
|
||||||
}
|
|
||||||
json jmsg {
|
|
||||||
{"role", msg.role},
|
|
||||||
};
|
|
||||||
if (!msg.content.empty()) {
|
|
||||||
jmsg["content"] = msg.content;
|
|
||||||
} else if (!msg.content_parts.empty()) {
|
|
||||||
if (concat_typed_text) {
|
|
||||||
std::string text;
|
|
||||||
for (const auto & part : msg.content_parts) {
|
|
||||||
if (part.type != "text") {
|
|
||||||
LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!text.empty()) {
|
|
||||||
text += '\n';
|
|
||||||
}
|
|
||||||
text += part.text;
|
|
||||||
}
|
|
||||||
jmsg["content"] = text;
|
|
||||||
} else {
|
|
||||||
auto & parts = jmsg["content"] = json::array();
|
|
||||||
for (const auto & part : msg.content_parts) {
|
|
||||||
parts.push_back({
|
|
||||||
{"type", part.type},
|
|
||||||
{"text", part.text},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
jmsg["content"] = "";
|
|
||||||
}
|
|
||||||
if (!msg.reasoning_content.empty()) {
|
|
||||||
jmsg["reasoning_content"] = msg.reasoning_content;
|
|
||||||
}
|
|
||||||
if (!msg.tool_name.empty()) {
|
|
||||||
jmsg["name"] = msg.tool_name;
|
|
||||||
}
|
|
||||||
if (!msg.tool_call_id.empty()) {
|
|
||||||
jmsg["tool_call_id"] = msg.tool_call_id;
|
|
||||||
}
|
|
||||||
if (!msg.tool_calls.empty()) {
|
|
||||||
auto & tool_calls = jmsg["tool_calls"] = json::array();
|
|
||||||
for (const auto & tool_call : msg.tool_calls) {
|
|
||||||
json tc {
|
|
||||||
{"type", "function"},
|
|
||||||
{"function", {
|
|
||||||
{"name", tool_call.name},
|
|
||||||
{"arguments", tool_call.arguments},
|
|
||||||
}},
|
|
||||||
};
|
|
||||||
if (!tool_call.id.empty()) {
|
|
||||||
tc["id"] = tool_call.id;
|
|
||||||
}
|
|
||||||
tool_calls.push_back(tc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
messages.push_back(jmsg);
|
messages.push_back(jmsg);
|
||||||
}
|
}
|
||||||
return messages;
|
return messages;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const std::string & messages) {
|
|
||||||
return common_chat_msgs_parse_oaicompat(json::parse(messages));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
|
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
|
||||||
std::vector<common_chat_tool> result;
|
std::vector<common_chat_tool> result;
|
||||||
|
|
||||||
|
|
@ -459,12 +424,6 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const std::string & tools) {
|
|
||||||
return common_chat_tools_parse_oaicompat(json::parse(tools));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
|
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
|
||||||
if (tools.empty()) {
|
if (tools.empty()) {
|
||||||
return json();
|
return json();
|
||||||
|
|
@ -484,7 +443,7 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
||||||
json delta = json::object();
|
json delta = json::object();
|
||||||
if (!diff.reasoning_content_delta.empty()) {
|
if (!diff.reasoning_content_delta.empty()) {
|
||||||
delta["reasoning_content"] = diff.reasoning_content_delta;
|
delta["reasoning_content"] = diff.reasoning_content_delta;
|
||||||
|
|
@ -2867,13 +2826,13 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||||
const struct common_chat_templates_inputs & inputs)
|
const struct common_chat_templates_inputs & inputs)
|
||||||
{
|
{
|
||||||
templates_params params;
|
templates_params params;
|
||||||
params.tools = common_chat_tools_to_json_oaicompat<json>(inputs.tools);
|
params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
|
||||||
const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
|
const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
|
||||||
? *tmpls->template_tool_use
|
? *tmpls->template_tool_use
|
||||||
: *tmpls->template_default;
|
: *tmpls->template_default;
|
||||||
const auto & src = tmpl.source();
|
const auto & src = tmpl.source();
|
||||||
const auto & caps = tmpl.original_caps();
|
const auto & caps = tmpl.original_caps();
|
||||||
params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
|
params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
|
||||||
params.add_generation_prompt = inputs.add_generation_prompt;
|
params.add_generation_prompt = inputs.add_generation_prompt;
|
||||||
params.tool_choice = inputs.tool_choice;
|
params.tool_choice = inputs.tool_choice;
|
||||||
params.reasoning_format = inputs.reasoning_format;
|
params.reasoning_format = inputs.reasoning_format;
|
||||||
|
|
@ -2943,6 +2902,10 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||||
src.find("<arg_value>") != std::string::npos &&
|
src.find("<arg_value>") != std::string::npos &&
|
||||||
params.json_schema.is_null()) {
|
params.json_schema.is_null()) {
|
||||||
workaround::func_args_not_string(params.messages);
|
workaround::func_args_not_string(params.messages);
|
||||||
|
if (!params.extra_context.contains("clear_thinking")) {
|
||||||
|
// by default, do not clear reasoning_content (added since GLM-4.7)
|
||||||
|
params.extra_context["clear_thinking"] = false;
|
||||||
|
}
|
||||||
return common_chat_params_init_glm_4_5(tmpl, params);
|
return common_chat_params_init_glm_4_5(tmpl, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -3174,3 +3137,9 @@ common_chat_params common_chat_templates_apply(
|
||||||
? common_chat_templates_apply_jinja(tmpls, inputs)
|
? common_chat_templates_apply_jinja(tmpls, inputs)
|
||||||
: common_chat_templates_apply_legacy(tmpls, inputs);
|
: common_chat_templates_apply_legacy(tmpls, inputs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
|
||||||
|
GGML_ASSERT(chat_templates != nullptr);
|
||||||
|
GGML_ASSERT(chat_templates->template_default != nullptr);
|
||||||
|
return chat_templates->template_default->caps.to_map();
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
|
||||||
|
#include <nlohmann/json_fwd.hpp>
|
||||||
|
|
||||||
struct common_chat_templates;
|
struct common_chat_templates;
|
||||||
|
|
||||||
struct common_chat_tool_call {
|
struct common_chat_tool_call {
|
||||||
|
|
@ -26,6 +28,11 @@ struct common_chat_msg_content_part {
|
||||||
std::string type;
|
std::string type;
|
||||||
std::string text;
|
std::string text;
|
||||||
|
|
||||||
|
// TODO @ngxson : no known chat templates support reasoning_content in content parts yet
|
||||||
|
// this can be useful for models with interleaved thinking (like Kimi-K2)
|
||||||
|
// if you see any templates explicitly support this, please ping me
|
||||||
|
// std::string reasoning_content;
|
||||||
|
|
||||||
bool operator==(const common_chat_msg_content_part & other) const {
|
bool operator==(const common_chat_msg_content_part & other) const {
|
||||||
return type == other.type && text == other.text;
|
return type == other.type && text == other.text;
|
||||||
}
|
}
|
||||||
|
|
@ -40,7 +47,7 @@ struct common_chat_msg {
|
||||||
std::string tool_name;
|
std::string tool_name;
|
||||||
std::string tool_call_id;
|
std::string tool_call_id;
|
||||||
|
|
||||||
template <class T> T to_json_oaicompat() const;
|
nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;
|
||||||
|
|
||||||
bool empty() const {
|
bool empty() const {
|
||||||
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
|
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
|
||||||
|
|
@ -232,13 +239,13 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
|
||||||
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
|
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
|
||||||
|
|
||||||
// Parses a JSON array of messages in OpenAI's chat completion API format.
|
// Parses a JSON array of messages in OpenAI's chat completion API format.
|
||||||
// T can be std::string containing JSON or nlohmann::ordered_json
|
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
|
||||||
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
|
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
|
||||||
template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
|
|
||||||
|
|
||||||
// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
|
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
|
||||||
// T can be std::string containing JSON or nlohmann::ordered_json
|
nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
|
||||||
template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
|
|
||||||
template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
|
|
||||||
|
|
||||||
template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
|
nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
|
||||||
|
|
||||||
|
// get template caps, useful for reporting to server /props endpoint
|
||||||
|
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
|
||||||
|
|
|
||||||
|
|
@ -61,14 +61,23 @@ static void caps_print_stats(value & v, const std::string & path) {
|
||||||
ops.c_str());
|
ops.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::map<std::string, bool> caps::to_map() const {
|
||||||
|
return {
|
||||||
|
{"requires_typed_content", requires_typed_content},
|
||||||
|
{"supports_tools", supports_tools},
|
||||||
|
{"supports_tool_calls", supports_tool_calls},
|
||||||
|
{"supports_parallel_tool_calls", supports_parallel_tool_calls},
|
||||||
|
{"supports_system_role", supports_system_role},
|
||||||
|
{"supports_preserve_reasoning", supports_preserve_reasoning},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
std::string caps::to_string() const {
|
std::string caps::to_string() const {
|
||||||
std::ostringstream ss;
|
std::ostringstream ss;
|
||||||
ss << "Caps(\n";
|
ss << "Caps(\n";
|
||||||
ss << " requires_typed_content=" << requires_typed_content << "\n";
|
for (const auto & [key, value] : to_map()) {
|
||||||
ss << " supports_tools=" << supports_tools << "\n";
|
ss << " " << key << "=" << (value ? "true" : "false") << "\n";
|
||||||
ss << " supports_tool_calls=" << supports_tool_calls << "\n";
|
}
|
||||||
ss << " supports_parallel_tool_calls=" << supports_parallel_tool_calls << "\n";
|
|
||||||
ss << " supports_system_role=" << supports_system_role << "\n";
|
|
||||||
ss << ")";
|
ss << ")";
|
||||||
return ss.str();
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
@ -229,6 +238,40 @@ caps caps_get(jinja::program & prog) {
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// case: preserve reasoning content in chat history
|
||||||
|
caps_try_execute(
|
||||||
|
prog,
|
||||||
|
[&]() {
|
||||||
|
// messages
|
||||||
|
return json::array({
|
||||||
|
{
|
||||||
|
{"role", "user"},
|
||||||
|
{"content", "User message"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
{"role", "assistant"},
|
||||||
|
{"content", "Assistant message"},
|
||||||
|
{"reasoning_content", "Reasoning content"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
{"role", "user"},
|
||||||
|
{"content", "User message"}
|
||||||
|
},
|
||||||
|
});
|
||||||
|
},
|
||||||
|
[&]() {
|
||||||
|
// tools
|
||||||
|
return json::array();
|
||||||
|
},
|
||||||
|
[&](bool, value & messages, value &) {
|
||||||
|
auto & content = messages->at(1)->at("reasoning_content");
|
||||||
|
caps_print_stats(content, "messages[1].reasoning_content");
|
||||||
|
if (content->stats.used) {
|
||||||
|
result.supports_preserve_reasoning = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
JJ_DEBUG("%s\n", result.to_string().c_str());
|
JJ_DEBUG("%s\n", result.to_string().c_str());
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
#include "runtime.h"
|
#include "runtime.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
namespace jinja {
|
namespace jinja {
|
||||||
|
|
||||||
|
|
@ -11,14 +12,17 @@ struct caps {
|
||||||
bool supports_tool_calls = true;
|
bool supports_tool_calls = true;
|
||||||
bool supports_system_role = true;
|
bool supports_system_role = true;
|
||||||
bool supports_parallel_tool_calls = true;
|
bool supports_parallel_tool_calls = true;
|
||||||
|
bool supports_preserve_reasoning = false; // support assistant message with reasoning_content
|
||||||
|
|
||||||
bool requires_typed_content = false; // default: use string content
|
bool requires_typed_content = false; // default: use string content
|
||||||
|
|
||||||
|
// for reporting on server
|
||||||
|
std::map<std::string, bool> to_map() const;
|
||||||
|
|
||||||
// for debugging
|
// for debugging
|
||||||
std::string to_string() const;
|
std::string to_string() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
caps caps_get(jinja::program & prog);
|
caps caps_get(jinja::program & prog);
|
||||||
void debug_print_caps(const caps & c);
|
|
||||||
|
|
||||||
} // namespace jinja
|
} // namespace jinja
|
||||||
|
|
|
||||||
|
|
@ -462,9 +462,9 @@ static void test_parser_with_streaming(const common_chat_msg & expected, const s
|
||||||
for (size_t i = 1; i <= raw_message.size(); ++i) {
|
for (size_t i = 1; i <= raw_message.size(); ++i) {
|
||||||
auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
|
auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
|
||||||
if (curr_msg == simple_assist_msg("")) continue;
|
if (curr_msg == simple_assist_msg("")) continue;
|
||||||
LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat<json>({curr_msg}).dump().c_str());
|
LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat({curr_msg}).dump().c_str());
|
||||||
for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
|
for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
|
||||||
LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat<json>(diff).dump().c_str());
|
LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat(diff).dump().c_str());
|
||||||
if (!diff.reasoning_content_delta.empty()) {
|
if (!diff.reasoning_content_delta.empty()) {
|
||||||
merged.reasoning_content += diff.reasoning_content_delta;
|
merged.reasoning_content += diff.reasoning_content_delta;
|
||||||
}
|
}
|
||||||
|
|
@ -480,7 +480,7 @@ static void test_parser_with_streaming(const common_chat_msg & expected, const s
|
||||||
merged.tool_calls.back().arguments += diff.tool_call_delta.arguments;
|
merged.tool_calls.back().arguments += diff.tool_call_delta.arguments;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat<json>({merged}).dump().c_str());
|
LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat({merged}).dump().c_str());
|
||||||
}
|
}
|
||||||
assert_msg_equals(curr_msg, merged, true);
|
assert_msg_equals(curr_msg, merged, true);
|
||||||
last_msg = curr_msg;
|
last_msg = curr_msg;
|
||||||
|
|
@ -622,7 +622,7 @@ static void test_msgs_oaicompat_json_conversion() {
|
||||||
message_assist_call_code_interpreter,
|
message_assist_call_code_interpreter,
|
||||||
};
|
};
|
||||||
for (const auto & msg : msgs) {
|
for (const auto & msg : msgs) {
|
||||||
auto oai_json = common_chat_msgs_to_json_oaicompat<json>({msg});
|
auto oai_json = common_chat_msgs_to_json_oaicompat({msg});
|
||||||
auto msgs2 = common_chat_msgs_parse_oaicompat(oai_json);
|
auto msgs2 = common_chat_msgs_parse_oaicompat(oai_json);
|
||||||
assert_equals((size_t) 1, msgs2.size());
|
assert_equals((size_t) 1, msgs2.size());
|
||||||
auto msg2 = msgs2[0];
|
auto msg2 = msgs2[0];
|
||||||
|
|
@ -646,7 +646,7 @@ static void test_msgs_oaicompat_json_conversion() {
|
||||||
" }\n"
|
" }\n"
|
||||||
"]"
|
"]"
|
||||||
),
|
),
|
||||||
common_chat_msgs_to_json_oaicompat<json>({message_user_parts}).dump(2));
|
common_chat_msgs_to_json_oaicompat({message_user_parts}).dump(2));
|
||||||
|
|
||||||
assert_equals(
|
assert_equals(
|
||||||
std::string(
|
std::string(
|
||||||
|
|
@ -666,7 +666,7 @@ static void test_msgs_oaicompat_json_conversion() {
|
||||||
" }\n"
|
" }\n"
|
||||||
"]"
|
"]"
|
||||||
),
|
),
|
||||||
common_chat_msgs_to_json_oaicompat<json>({message_assist_call_python}).dump(2));
|
common_chat_msgs_to_json_oaicompat({message_assist_call_python}).dump(2));
|
||||||
|
|
||||||
auto res = common_chat_msgs_parse_oaicompat(json::parse("[{\"role\": \"assistant\", \"tool_calls\": []}]"));
|
auto res = common_chat_msgs_parse_oaicompat(json::parse("[{\"role\": \"assistant\", \"tool_calls\": []}]"));
|
||||||
assert_equals<size_t>(1, res.size());
|
assert_equals<size_t>(1, res.size());
|
||||||
|
|
@ -693,7 +693,7 @@ static void test_tools_oaicompat_json_conversion() {
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const auto & tool : tools) {
|
for (const auto & tool : tools) {
|
||||||
auto oai_json = common_chat_tools_to_json_oaicompat<json>({tool});
|
auto oai_json = common_chat_tools_to_json_oaicompat({tool});
|
||||||
auto tools2 = common_chat_tools_parse_oaicompat(oai_json);
|
auto tools2 = common_chat_tools_parse_oaicompat(oai_json);
|
||||||
assert_equals((size_t) 1, tools2.size());
|
assert_equals((size_t) 1, tools2.size());
|
||||||
auto tool2 = tools2[0];
|
auto tool2 = tools2[0];
|
||||||
|
|
@ -726,7 +726,7 @@ static void test_tools_oaicompat_json_conversion() {
|
||||||
" }\n"
|
" }\n"
|
||||||
"]"
|
"]"
|
||||||
),
|
),
|
||||||
common_chat_tools_to_json_oaicompat<json>({special_function_tool}).dump(2));
|
common_chat_tools_to_json_oaicompat({special_function_tool}).dump(2));
|
||||||
|
|
||||||
{
|
{
|
||||||
auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse(
|
auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse(
|
||||||
|
|
|
||||||
|
|
@ -781,6 +781,7 @@ By default, it is read-only. To make POST request to change global properties, y
|
||||||
"total_slots": 1,
|
"total_slots": 1,
|
||||||
"model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
|
"model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
|
||||||
"chat_template": "...",
|
"chat_template": "...",
|
||||||
|
"chat_template_caps": {},
|
||||||
"modalities": {
|
"modalities": {
|
||||||
"vision": false
|
"vision": false
|
||||||
},
|
},
|
||||||
|
|
@ -793,6 +794,7 @@ By default, it is read-only. To make POST request to change global properties, y
|
||||||
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
|
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
|
||||||
- `model_path` - the path to the model file (same as the `-m` argument)
|
- `model_path` - the path to the model file (same as the `-m` argument)
|
||||||
- `chat_template` - the model's original Jinja2 prompt template
|
- `chat_template` - the model's original Jinja2 prompt template
|
||||||
|
- `chat_template_caps` - capabilities of the chat template (see `common/jinja/caps.h` for more info)
|
||||||
- `modalities` - the list of supported modalities
|
- `modalities` - the list of supported modalities
|
||||||
- `is_sleeping` - sleeping status, see [Sleeping on idle](#sleeping-on-idle)
|
- `is_sleeping` - sleeping status, see [Sleeping on idle](#sleeping-on-idle)
|
||||||
|
|
||||||
|
|
@ -1267,6 +1269,12 @@ This provides information on the performance of the server. It also allows calcu
|
||||||
|
|
||||||
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
|
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
|
||||||
|
|
||||||
|
*Reasoning support*
|
||||||
|
|
||||||
|
The server supports parsing and returning reasoning via the `reasoning_content` field, similar to the DeepSeek API.
|
||||||
|
|
||||||
|
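Reasoning input (preserving `reasoning_content` from earlier assistant messages in the chat history) is also supported, but only by chat templates that make use of that field; this is reported as `supports_preserve_reasoning` under `chat_template_caps` in the `/props` response. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994).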
|
||||||
|
|
||||||
### POST `/v1/responses`: OpenAI-compatible Responses API
|
### POST `/v1/responses`: OpenAI-compatible Responses API
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
|
|
|
||||||
|
|
@ -2903,6 +2903,7 @@ server_context_meta server_context::get_meta() const {
|
||||||
/* pooling_type */ llama_pooling_type(impl->ctx),
|
/* pooling_type */ llama_pooling_type(impl->ctx),
|
||||||
|
|
||||||
/* chat_params */ impl->chat_params,
|
/* chat_params */ impl->chat_params,
|
||||||
|
/* chat_template_caps */ common_chat_templates_get_caps(impl->chat_params.tmpls.get()),
|
||||||
|
|
||||||
/* bos_token_str */ bos_token_str,
|
/* bos_token_str */ bos_token_str,
|
||||||
/* eos_token_str */ eos_token_str,
|
/* eos_token_str */ eos_token_str,
|
||||||
|
|
@ -3410,6 +3411,7 @@ void server_routes::init_routes() {
|
||||||
{ "webui", params.webui },
|
{ "webui", params.webui },
|
||||||
{ "webui_settings", meta->json_webui_settings },
|
{ "webui_settings", meta->json_webui_settings },
|
||||||
{ "chat_template", tmpl_default },
|
{ "chat_template", tmpl_default },
|
||||||
|
{ "chat_template_caps", meta->chat_template_caps },
|
||||||
{ "bos_token", meta->bos_token_str },
|
{ "bos_token", meta->bos_token_str },
|
||||||
{ "eos_token", meta->eos_token_str },
|
{ "eos_token", meta->eos_token_str },
|
||||||
{ "build_info", meta->build_info },
|
{ "build_info", meta->build_info },
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ struct server_context_meta {
|
||||||
|
|
||||||
// chat params
|
// chat params
|
||||||
server_chat_params & chat_params;
|
server_chat_params & chat_params;
|
||||||
|
std::map<std::string, bool> chat_template_caps;
|
||||||
|
|
||||||
// tokens
|
// tokens
|
||||||
std::string bos_token_str;
|
std::string bos_token_str;
|
||||||
|
|
|
||||||
|
|
@ -700,7 +700,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() {
|
||||||
json choice {
|
json choice {
|
||||||
{"finish_reason", finish_reason},
|
{"finish_reason", finish_reason},
|
||||||
{"index", index},
|
{"index", index},
|
||||||
{"message", msg.to_json_oaicompat<json>()},
|
{"message", msg.to_json_oaicompat()},
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!stream && probs_output.size() > 0) {
|
if (!stream && probs_output.size() > 0) {
|
||||||
|
|
@ -750,7 +750,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
|
||||||
json {
|
json {
|
||||||
{"finish_reason", nullptr},
|
{"finish_reason", nullptr},
|
||||||
{"index", 0},
|
{"index", 0},
|
||||||
{"delta", common_chat_msg_diff_to_json_oaicompat<json>(diff)},
|
{"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
|
||||||
},
|
},
|
||||||
})},
|
})},
|
||||||
{"created", t},
|
{"created", t},
|
||||||
|
|
@ -1383,7 +1383,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & diff : oaicompat_msg_diffs) {
|
for (const auto & diff : oaicompat_msg_diffs) {
|
||||||
add_delta(common_chat_msg_diff_to_json_oaicompat<json>(diff));
|
add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!deltas.empty()) {
|
if (!deltas.empty()) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue