From db38820013c30ea785d975ee99606e8dfcb0ab02 Mon Sep 17 00:00:00 2001 From: Jesse Posner Date: Sat, 14 Feb 2026 23:29:39 -0800 Subject: [PATCH 1/3] common : fix Step-3.5-Flash format detection and thinking support Step-3.5-Flash uses the same XML-style tool call format as Qwen3-Coder (<tool_call><function=...><parameter=...>) but its Jinja template lacks the bare <function> and plural <parameters> markers that the detection logic previously required. This caused it to fall through to Hermes 2 Pro, which doesn't call func_args_not_string(), so arguments stayed as JSON strings and templates using arguments|items crashed. Additionally, the Qwen3-Coder-XML format handler had no thinking support. Models like Step-3.5-Flash that unconditionally emit <think> in their generation prompt need the same thinking_forced_open handling that Nemotron v3 and Hermes 2 Pro already have, otherwise reasoning_content is never separated from content in API responses. Changes: - Relax Qwen3-Coder XML detection to only require the 3 shared markers - Tighten Nemotron v3 branch to also require bare <function> and plural <parameters>, preventing Step-3.5-Flash from being misrouted via <think> - Add thinking_forced_open support to Qwen3-Coder-XML init function - Add <think>/</think> to preserved tokens - Fix build_grammar_xml_tool_call to handle thinking_forced_open in the grammar root rule, allowing </think> before tool calls - Add Step-3.5-Flash chat template and format detection test Builds on: https://github.com/ggml-org/llama.cpp/pull/19283 --- common/chat-parser-xml-toolcall.cpp | 1 + common/chat.cpp | 24 ++++-- .../templates/stepfun-ai-Step-3.5-Flash.jinja | 80 +++++++++++++++++++ tests/test-chat.cpp | 6 ++ 4 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 models/templates/stepfun-ai-Step-3.5-Flash.jinja diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp index a80900ff8d..56d8bb410a 100644 --- a/common/chat-parser-xml-toolcall.cpp +++ b/common/chat-parser-xml-toolcall.cpp @@ -279,6 +279,7 @@ void build_grammar_xml_tool_call(common_chat_params & data, 
const json & tools, auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end)); auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end); builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") + tool_call_multiple_with_end + "?" + (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end)) diff --git a/common/chat.cpp b/common/chat.cpp index 47a34d5822..8abb6ba5f1 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1884,7 +1884,18 @@ static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_c data.prompt = apply(tmpl, params); data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML; + // Handle thinking tags (e.g. Step-3.5-Flash unconditionally emits ) + if (string_ends_with(data.prompt, "\n")) { + if (!params.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } + } + data.preserved_tokens = { + "", + "", "", "", ", , and blocks. + // Detect via XML markers: , , and blocks. + // Also matches Step-3.5-Flash which uses the same output format. 
if (src.find("") != std::string::npos && - src.find("") != std::string::npos && src.find("") != std::string::npos && src.find("") != std::string::npos) { + // Nemotron 3 Nano 30B A3B: also has bare and plural , + // which Step-3.5-Flash lacks despite also having + if (src.find("") != std::string::npos && + src.find("") != std::string::npos && + src.find("") != std::string::npos) { return common_chat_params_init_nemotron_v3(tmpl, params); } return common_chat_params_init_qwen3_coder_xml(tmpl, params); diff --git a/models/templates/stepfun-ai-Step-3.5-Flash.jinja b/models/templates/stepfun-ai-Step-3.5-Flash.jinja new file mode 100644 index 0000000000..c09ea497da --- /dev/null +++ b/models/templates/stepfun-ai-Step-3.5-Flash.jinja @@ -0,0 +1,80 @@ +{% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}{% endif %}{% endfor %}{% endif %}{% endmacro %} +{{bos_token}}{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- render_content(messages[0].content) + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou have access to the following functions in JSONSchema format:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson(ensure_ascii=False) }} + {%- endfor %} + {{- "\n\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner \n...\n block must be nested within \n...\n XML tags\n- Required parameters MUST be specified\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- 
'<|im_start|>system\n' + render_content(messages[0].content) + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- set content = render_content(message.content) %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {%- set role_name = 'observation' if (message.role == "system" and not loop.first and message.name == 'observation') else message.role %} + {{- '<|im_start|>' + role_name + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = render_content(message.reasoning_content) %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- else %} + {%- set reasoning_content = '' %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n' }} + {%- if tool_call.arguments is defined %} + {%- set arguments = tool_call.arguments %} + {%- for args_name, args_value in arguments|items %} + {{- 
'\n' }} + {%- set args_value = args_value | tojson(ensure_ascii=False) | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>tool_response\n' }} + {%- endif %} + {{- '' }} + {{- content }} + {{- '' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n\n' }} +{%- endif %} diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 4378a8db71..354b74c8ee 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -3553,6 +3553,12 @@ Hey there!<|im_end|> auto grammar = build_grammar(params.grammar); GGML_ASSERT(grammar && "Failed to build Qwen3-Coder grammar with union types"); } + + { + // Step-3.5-Flash template (uses same XML format as Qwen3-Coder but lacks and markers) + auto tmpls = read_templates("models/templates/stepfun-ai-Step-3.5-Flash.jinja"); + assert_equals(COMMON_CHAT_FORMAT_QWEN3_CODER_XML, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + } } static void test_template_output_peg_parsers() { From ac0f256df0791e5f14cd79b64b8e2ae1ad8b44f8 Mon Sep 17 00:00:00 2001 From: Jesse Posner Date: Sun, 15 Feb 2026 09:24:15 -0800 Subject: [PATCH 2/3] chat : route Step-3.5-Flash to Nemotron v3 PEG parser, add tests Step-3.5-Flash uses the same XML tool call format as Qwen3-Coder and Nemotron 3 Nano (<tool_call>/<function=...>/<parameter=...>) but with unconditional <think> output. Route it to the Nemotron v3 PEG parser for streaming and schema-aware parameter parsing. Detection: templates with <think> + XML tool tags use Nemotron v3 PEG parser; templates without <think> (Qwen3-Coder) use GBNF grammar. 
Tests cover: basic messages, tool calls with/without thinking content, parallel tool calls, code string parameters, optional closing tags, and JSON schema response format. --- common/chat.cpp | 11 ++- tests/test-chat.cpp | 211 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 214 insertions(+), 8 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 8abb6ba5f1..d87cb682c5 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -3141,16 +3141,15 @@ static common_chat_params common_chat_templates_apply_jinja( // Qwen3-Coder XML format detection (must come before Hermes 2 Pro) // Detect via XML markers: , , and blocks. - // Also matches Step-3.5-Flash which uses the same output format. + // Also matches Step-3.5-Flash and Nemotron 3 Nano which use the same output format. if (src.find("") != std::string::npos && src.find("") != std::string::npos && - src.find("") != std::string::npos && - src.find("") != std::string::npos) { + // Models with support (Step-3.5-Flash, Nemotron 3 Nano) use the + // Nemotron v3 PEG parser for streaming and schema-aware parameter parsing. + // Qwen3-Coder has no in its template. + if (src.find("") != std::string::npos) { return common_chat_params_init_nemotron_v3(tmpl, params); } return common_chat_params_init_qwen3_coder_xml(tmpl, params); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 354b74c8ee..1bf6e4d9ec 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -3555,9 +3555,26 @@ Hey there!<|im_end|> } { - // Step-3.5-Flash template (uses same XML format as Qwen3-Coder but lacks and markers) + // Step-3.5-Flash template: uses same XML output format as Qwen3-Coder and Nemotron v3, + // but with support. Routes to the Nemotron v3 PEG parser for streaming and + // schema-aware parameter parsing. 
auto tmpls = read_templates("models/templates/stepfun-ai-Step-3.5-Flash.jinja"); - assert_equals(COMMON_CHAT_FORMAT_QWEN3_CODER_XML, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_PEG_CONSTRUCTED, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Grammar and PEG parser should be generated with thinking_forced_open + { + common_chat_templates_inputs inputs; + inputs.messages = { message_user }; + inputs.tools = { special_function_tool }; + inputs.enable_thinking = true; + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_PEG_CONSTRUCTED, params.format); + assert_equals(true, params.thinking_forced_open); + assert_equals(false, params.grammar.empty()); + assert_equals(false, params.parser.empty()); + auto grammar = build_grammar(params.grammar); + GGML_ASSERT(grammar && "Failed to build Step-3.5-Flash grammar with thinking_forced_open"); + } } } @@ -3805,6 +3822,196 @@ static void test_template_output_peg_parsers() { }); } + { + // Step-3.5-Flash (uses Nemotron v3 PEG parser with thinking_forced_open) + // Unlike Nemotron, Step-3.5-Flash always emits regardless of enable_thinking, + // so all inputs must include a delimiter. 
+ auto tmpls = read_templates("models/templates/stepfun-ai-Step-3.5-Flash.jinja"); + + // Test basic message with reasoning + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "I'm\nthinking\n\nHello, world!\nWhat's up?"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + + t.expect = message_assist_thoughts; + }); + + // Test basic message without thinking content + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "\nHello, world!\nWhat's up?"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + + t.expect = message_assist; + }); + + // Test tool call without thinking content + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + ""; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {special_function_tool}; + + t.expect = message_assist_call; + }); + + // Test tool call with thinking + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "I'm\nthinking\n\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + ""; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {special_function_tool}; + + t.expect = message_assist_call_thoughts; + }); + + // Test parallel tool calls with thinking + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "I'm\nthinking\n\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "2\n" + "\n" + "\n" + ""; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.parallel_tool_calls = true; + t.params.tools = {special_function_tool, special_function_tool_with_optional_param}; + + t.expect.reasoning_content = "I'm\nthinking"; + t.expect.tool_calls = {{ + /* .name = */ "special_function", + /* .arguments = */ R"({"arg1": 1})", + /* .id = */ {}, + }, { + /* .name = */ "special_function_with_opt", + /* .arguments = */ R"({"arg1": 1, "arg2": 2})", + /* .id = */ {}, + }}; + }); + + // Test parallel tool calls without thinking content + 
test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "2\n" + "\n" + "\n" + ""; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.parallel_tool_calls = true; + t.params.tools = {special_function_tool, special_function_tool_with_optional_param}; + + t.expect.tool_calls = {{ + /* .name = */ "special_function", + /* .arguments = */ R"({"arg1": 1})", + /* .id = */ {}, + }, { + /* .name = */ "special_function_with_opt", + /* .arguments = */ R"({"arg1": 1, "arg2": 2})", + /* .id = */ {}, + }}; + }); + + // Test tool call with code string parameter + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "\n" + "def hello():\n" + " print(\"Hello, world!\")\n" + "\n" + "hello()\n" + "\n" + "\n" + ""; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {python_tool}; + + t.expect.tool_calls = {{ + /* .name = */ "python", + /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", + /* .id = */ {}, + }}; + }); + + // Test tool call with string parameter and no closing tag + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "\n" + "def hello():\n" + " print(\"Hello, world!\")\n" + "\n" + "hello()\n" + "\n" + ""; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {python_tool}; + + t.expect.tool_calls = {{ + /* .name = */ "python", + /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", + /* .id = */ {}, + }}; + }); + + // Test response format (JSON schema with thinking) + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "I need to output the invoice details in JSON\n" + "\n" + R"({"amount": 123.45, "date": "2025-12-03"})"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.json_schema = invoice_schema; + + t.expect.reasoning_content = 
"I need to output the invoice details in JSON"; + t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})"; + }); + } + { // Solar-Open-100B auto tmpls = read_templates("models/templates/upstage-Solar-Open-100B.jinja"); From bdc1dda64f80393af9255128dcdf70ea08e3e046 Mon Sep 17 00:00:00 2001 From: Jesse Posner Date: Sun, 15 Feb 2026 22:25:12 -0800 Subject: [PATCH 3/3] chat : remove dead thinking code from qwen3_coder_xml Remove thinking handling code that became unreachable after routing Step-3.5-Flash to the Nemotron v3 PEG parser. Qwen3-Coder has no <think> in its template, so the thinking_forced_open logic, preserved tokens, and grammar prefix were dead paths. --- common/chat-parser-xml-toolcall.cpp | 1 - common/chat.cpp | 11 ----------- tests/test-chat.cpp | 3 +-- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp index 56d8bb410a..a80900ff8d 100644 --- a/common/chat-parser-xml-toolcall.cpp +++ b/common/chat-parser-xml-toolcall.cpp @@ -279,7 +279,6 @@ void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end)); auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end); builder.add_rule("root", - std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") + tool_call_multiple_with_end + "?" + (form.scope_end.empty() ? 
"" : " " + gbnf_format_literal(form.scope_end)) diff --git a/common/chat.cpp b/common/chat.cpp index d87cb682c5..04fe8bc070 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1884,18 +1884,7 @@ static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_c data.prompt = apply(tmpl, params); data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML; - // Handle thinking tags (e.g. Step-3.5-Flash unconditionally emits ) - if (string_ends_with(data.prompt, "\n")) { - if (!params.enable_thinking) { - data.prompt += ""; - } else { - data.thinking_forced_open = true; - } - } - data.preserved_tokens = { - "", - "", "", "", " common_chat_templates_inputs inputs; inputs.messages = { message_user }; inputs.tools = { special_function_tool }; - inputs.enable_thinking = true; auto params = common_chat_templates_apply(tmpls.get(), inputs); assert_equals(COMMON_CHAT_FORMAT_PEG_CONSTRUCTED, params.format); assert_equals(true, params.thinking_forced_open); assert_equals(false, params.grammar.empty()); assert_equals(false, params.parser.empty()); auto grammar = build_grammar(params.grammar); - GGML_ASSERT(grammar && "Failed to build Step-3.5-Flash grammar with thinking_forced_open"); + GGML_ASSERT(grammar && "Failed to build Step-3.5-Flash grammar"); } } }