diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp index 2ef744278a..cc408a6869 100644 --- a/common/reasoning-budget.cpp +++ b/common/reasoning-budget.cpp @@ -115,9 +115,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to break; } case REASONING_BUDGET_FORCING: - // force_pos is advanced in apply(), not here. - // This ensures the first forced token isn't skipped when the sampler - // is initialized directly in FORCING state (e.g. COUNTING + budget=0) + ctx->force_pos++; + if (ctx->force_pos >= ctx->forced_tokens.size()) { + ctx->state = REASONING_BUDGET_DONE; + LOG_INF("reasoning-budget: forced sequence complete, done\n"); + } break; case REASONING_BUDGET_DONE: break; @@ -144,14 +146,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok cur_p->data[i].logit = -INFINITY; } } - - // advance to next forced token (done here rather than in accept so that - // the first forced token isn't skipped when starting in FORCING state) - ctx->force_pos++; - if (ctx->force_pos >= ctx->forced_tokens.size()) { - ctx->state = REASONING_BUDGET_DONE; - LOG_INF("reasoning-budget: forced sequence complete, done\n"); - } } static void common_reasoning_budget_reset(struct llama_sampler * smpl) { @@ -261,3 +255,10 @@ struct llama_sampler * common_reasoning_budget_init( common_reasoning_budget_state initial_state) { return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state); } + +common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) { + if (!smpl) { + return REASONING_BUDGET_IDLE; + } + return ((const common_reasoning_budget_ctx *)smpl->ctx)->state; +} diff --git a/common/reasoning-budget.h b/common/reasoning-budget.h index 130afdea4a..ee1a30ed3c 100644 --- a/common/reasoning-budget.h +++ b/common/reasoning-budget.h @@ -51,3 +51,5 @@ struct llama_sampler * common_reasoning_budget_init( const std::vector & forced_tokens, int32_t budget, common_reasoning_budget_state initial_state); + +common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl); diff --git a/common/sampling.cpp b/common/sampling.cpp index 012e212660..5259c5f3c6 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -109,6 +110,7 @@ struct common_sampler { common_params_sampling params; struct llama_sampler * grmr; + struct llama_sampler * rbudget; struct llama_sampler * chain; ring_buffer prev; @@ -188,6 +190,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st lparams.no_perf = params.no_perf; llama_sampler * grmr = nullptr; + llama_sampler * rbudget = nullptr; llama_sampler * chain = llama_sampler_chain_init(lparams); std::vector samplers; @@ -270,7 +273,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st } } - if (grmr) { + if (grmr && !params.grammar_lazy) { try { for (const auto & token : prefill_tokens) { llama_sampler_accept(grmr, token); @@ -284,15 +287,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st } } - // reasoning budget sampler — added first so it can force tokens before other samplers - if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) { - samplers.push_back(common_reasoning_budget_init( + // reasoning budget sampler + if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) { + rbudget = common_reasoning_budget_init( vocab, params.reasoning_budget_start, params.reasoning_budget_end, params.reasoning_budget_forced, - params.reasoning_budget_tokens, - prefill_tokens)); + params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens, + prefill_tokens); } if (params.has_logit_bias()) { @@ -383,6 +386,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st auto * result = new common_sampler { /* .params = */ params, /* .grmr = */ grmr, + /* .rbudget = */ rbudget, /* .chain = */ chain, /* .prev = */ ring_buffer(std::max(32, params.n_prev)), /* .cur = */ {}, @@ -398,11 +402,27 @@ void common_sampler_free(struct common_sampler * gsmpl) { } llama_sampler_free(gsmpl->grmr); + llama_sampler_free(gsmpl->rbudget); llama_sampler_free(gsmpl->chain); delete gsmpl; } +static bool grammar_should_apply(struct common_sampler * gsmpl) { + if (!gsmpl->grmr) { + return false; + } + if (!gsmpl->rbudget) { + return true; + } + if (gsmpl->params.grammar_lazy) { + // if grammar is lazy, only apply when reasoning budget is not active + const auto state = common_reasoning_budget_get_state(gsmpl->rbudget); + return state == REASONING_BUDGET_IDLE || state == REASONING_BUDGET_DONE; + } + return true; +} + void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { if (!gsmpl) { return; @@ -410,6 +430,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo const auto tm = gsmpl->tm(); + // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept + accept_grammar = accept_grammar && grammar_should_apply(gsmpl); + + llama_sampler_accept(gsmpl->rbudget, token); + if (gsmpl->grmr && accept_grammar) { llama_sampler_accept(gsmpl->grmr, token); } @@ -431,6 +456,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { return new common_sampler { /* .params = */ gsmpl->params, /* .grmr = */ llama_sampler_clone(gsmpl->grmr), + /* .rbudget = */ llama_sampler_clone(gsmpl->rbudget), /* .chain = */ llama_sampler_clone(gsmpl->chain), /* .prev = */ gsmpl->prev, /* .cur = */ gsmpl->cur, @@ -500,6 +526,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co llama_token id = LLAMA_TOKEN_NULL; auto & grmr = gsmpl->grmr; + auto & rbudget = gsmpl->rbudget; auto & chain = gsmpl->chain; auto & cur_p = gsmpl->cur_p; // initialized by set_logits @@ -511,7 +538,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co if (id != LLAMA_TOKEN_NULL) { LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id); - GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported"); + GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported"); + GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported"); // TODO: simplify gsmpl->cur.resize(1); @@ -524,7 +552,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co gsmpl->set_logits(ctx, idx); - if (grammar_first) { + // apply reasoning budget first + llama_sampler_apply(rbudget, &cur_p); + + if (grammar_first && grammar_should_apply(gsmpl)) { llama_sampler_apply(grmr, &cur_p); } @@ -532,7 +563,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co id = cur_p.data[cur_p.selected].id; - if (grammar_first) { + if (grammar_first || !grammar_should_apply(gsmpl)) { return id; } @@ -553,7 +584,12 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain gsmpl->set_logits(ctx, idx); - llama_sampler_apply(grmr, &cur_p); + llama_sampler_apply(rbudget, &cur_p); + + if (grammar_should_apply(gsmpl)) { + llama_sampler_apply(grmr, &cur_p); + } + llama_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 575d240791..74f078f5ed 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -936,75 +936,158 @@ static void test_peg_parser(common_chat_templates * tmpls, throw std::runtime_error("Failed to build grammar: " + parser.params_.grammar); } - // Find the earliest trigger position to determine the constrained portion - auto earliest_trigger_pos = std::string::npos; - for (const auto & trigger : parser.params_.grammar_triggers) { - size_t pos = std::string::npos; - std::smatch match; - switch (trigger.type) { - case COMMON_GRAMMAR_TRIGGER_TYPE_WORD: - { - const auto & word = trigger.value; - pos = tc.input.find(word); - break; - } - case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: - { - const auto & pattern = std::regex(trigger.value); - if (std::regex_search(tc.input, match, pattern)) { - pos = match.position(pattern.mark_count()); - } - break; - } - case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL: - { - const auto & pattern = trigger.value; - if (std::regex_match(tc.input, match, std::regex(pattern))) { - auto mpos = std::string::npos; - for (size_t i = 1; i < match.size(); ++i) { - if (match[i].length() > 0) { - mpos = match.position(i); - break; - } - } - if (mpos == std::string::npos) { - mpos = match.position(0); - } - pos = mpos; - } - break; - } - default: - throw std::runtime_error("Unknown trigger type"); - } - if (pos != std::string::npos) { - if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) { - earliest_trigger_pos = pos; + // In production, grammar triggers match against the full generated text + // including the generation prompt. All positions are in full_input coordinates. + const auto & gen_prompt = parser.params_.generation_prompt; + std::string full_input = gen_prompt + tc.input; + + // Determine whether the reasoning-budget sampler path applies: tool-call grammar + // with all WORD triggers and thinking tags present. In production, the reasoning + // budget sampler inhibits grammar application while inside thinking blocks — + // triggers inside ... are suppressed. + bool use_reasoning_budget_path = false; + if (parser.params_.grammar_lazy && !parser.params_.thinking_end_tag.empty()) { + use_reasoning_budget_path = true; + for (const auto & trigger : parser.params_.grammar_triggers) { + if (trigger.type != COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + use_reasoning_budget_path = false; + break; } } } - // Determine the constrained portion of input to test against grammar - std::string constrained = tc.input; + // Find the earliest trigger position to determine the constrained portion + auto earliest_trigger_pos = std::string::npos; + + if (use_reasoning_budget_path) { + // Reasoning-budget path: simulate thinking-aware trigger detection. + // Walk through full_input tracking thinking state; only match triggers + // when outside thinking blocks. + const auto & think_start = parser.params_.thinking_start_tag; + const auto & think_end = parser.params_.thinking_end_tag; + + bool in_thinking = false; + for (size_t i = 0; i < full_input.size(); ++i) { + if (!in_thinking && !think_start.empty() + && full_input.compare(i, think_start.size(), think_start) == 0) { + in_thinking = true; + i += think_start.size() - 1; + continue; + } + if (in_thinking && full_input.compare(i, think_end.size(), think_end) == 0) { + in_thinking = false; + i += think_end.size() - 1; + continue; + } + if (in_thinking) { + continue; + } + // Outside thinking — check if any trigger word starts here + for (const auto & trigger : parser.params_.grammar_triggers) { + if (full_input.compare(i, trigger.value.size(), trigger.value) == 0) { + if (earliest_trigger_pos == std::string::npos || i < earliest_trigger_pos) { + earliest_trigger_pos = i; + } + } + } + if (earliest_trigger_pos != std::string::npos) { + break; // found the earliest + } + } + + // If the reasoning-budget path found no trigger outside thinking but the test + // expects tool calls, this template nests tool calls inside thinking + // blocks (e.g. Kimi). Fall back to the legacy path for this case. + if (earliest_trigger_pos == std::string::npos && !tc.expect.tool_calls.empty()) { + use_reasoning_budget_path = false; + } + } + + if (!use_reasoning_budget_path) { + // Legacy path: find triggers without thinking-awareness + for (const auto & trigger : parser.params_.grammar_triggers) { + size_t pos = std::string::npos; + std::smatch match; + switch (trigger.type) { + case COMMON_GRAMMAR_TRIGGER_TYPE_WORD: + { + const auto & word = trigger.value; + pos = full_input.find(word); + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: + { + const auto & compiled = std::regex(trigger.value); + if (std::regex_search(full_input, match, compiled)) { + pos = match.position(compiled.mark_count()); + } + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL: + { + // In production, PATTERN_FULL triggers are checked against + // the text generated so far, growing token by token. Simulate + // by trying every prefix of full_input. + const auto & compiled = std::regex(trigger.value); + for (size_t end = gen_prompt.size(); end <= full_input.size(); ++end) { + std::string prefix = full_input.substr(0, end); + if (std::regex_match(prefix, match, compiled)) { + pos = std::string::npos; + for (size_t gi = 1; gi < match.size(); ++gi) { + if (match[gi].length() > 0) { + pos = match.position(gi); + break; + } + } + if (pos == std::string::npos) { + pos = match.position(0); + } + break; + } + } + break; + } + default: + throw std::runtime_error("Unknown trigger type"); + } + if (pos != std::string::npos) { + if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) { + earliest_trigger_pos = pos; + } + } + } + } + + // If the test expects tool calls and the grammar is lazy, the trigger must fire. + // Otherwise the grammar would never activate in production and tool calls wouldn't + // be constrained. A silent skip here would hide broken triggers. + if (parser.params_.grammar_lazy && !tc.expect.tool_calls.empty() && !tc.is_partial + && earliest_trigger_pos == std::string::npos) { + std::string trigger_desc; + for (const auto & trigger : parser.params_.grammar_triggers) { + trigger_desc += "\n [type=" + std::to_string(trigger.type) + "] " + trigger.value; + } + throw std::runtime_error( + "Grammar trigger did not fire, but test expects tool calls (lazy grammar).\n" + ">>> Input: " + full_input + "\n" + ">>> Triggers (" + std::to_string(parser.params_.grammar_triggers.size()) + "):" + trigger_desc); + } + + // Determine the constrained portion of input to test against grammar. + // If the trigger position falls inside the generation prompt, the grammar + // sampler was already active before model output began — constrain from the + // start of the model output (i.e. tc.input). + std::string constrained = full_input; bool grammar_triggered = false; if (earliest_trigger_pos != std::string::npos) { - constrained = tc.input.substr(earliest_trigger_pos); + auto constrain_from = std::max(earliest_trigger_pos, gen_prompt.size()); + constrained = full_input.substr(constrain_from); grammar_triggered = true; } else if (!parser.params_.grammar_lazy) { // For non-lazy grammars, the entire input should match grammar_triggered = true; } - // For non-lazy grammars, prepend reasoning prefill to grammar input, just like - // PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional - // ...), but the model output may start mid-reasoning if the template - // already placed the opening tag in the prompt. - // For lazy grammars, the grammar only activates from the trigger position, so the - // reasoning prefill is irrelevant — reasoning is handled by the PEG parser. - if (!parser.params_.generation_prompt.empty() && earliest_trigger_pos == std::string::npos) { - constrained = parser.params_.generation_prompt + constrained; - } - // Test the constrained portion against the grammar if (grammar_triggered && !tc.is_partial) { auto result = match_string_detailed(constrained, grammar.get()); @@ -1323,6 +1406,19 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect_reasoning("I need to output the invoice details in JSON") .expect_content(R"({"amount": 123.45, "date": "2025-12-03"})") .run(); + + // fake tool call marker in reasoning + tst.test( + "[THINK]Let me think about [TOOL_CALLS]special_function[ARGS]{\"arg1\":1} and more[/THINK]" + R"([TOOL_CALLS]special_function[ARGS]{"arg1": 1})") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ special_function_tool }) + .expect_reasoning("Let me think about [TOOL_CALLS]special_function[ARGS]{\"arg1\":1} and more") + .expect_tool_calls({ + { "special_function", R"({"arg1": 1})", {} }, + }) + .run(); } { @@ -1425,6 +1521,50 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect_reasoning("I need to output the invoice details in JSON") .expect_content(R"({"amount": 123.45, "date": "2025-12-03"})") .run(); + + // tool call segment in reasoning + tst.test( + "Let's call a tool: \n" + "\n" + "\n" + "def hello():\n" + " print(\"Not the real call!\")\n" + "\n" + "hello()\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "def hello():\n" + " print(\"Hello, world!\")\n" + "\n" + "hello()\n" + "\n" + "\n" + "" + ) + .enable_thinking(true) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + python_tool + }) + .expect_reasoning("Let's call a tool: \n" + "\n" + "\n" + "def hello():\n" + " print(\"Not the real call!\")\n" + "\n" + "hello()\n" + "\n" + "\n" + "") + .expect_tool_calls({ + { "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} }, + }) + .run(); + } { @@ -2297,6 +2437,19 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .tools({ empty_args_tool }) .expect(simple_assist_msg("", "", "empty_args", "{}")) .run(); + + // fake tool call marker in reasoning + tst.test( + "Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm" + "<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") + .enable_thinking(true) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ special_function_tool }) + .expect_reasoning("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm") + .expect_tool_calls({ + { "special_function", R"({"arg1": 1})", {} }, + }) + .run(); } // Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format diff --git a/tests/test-reasoning-budget.cpp b/tests/test-reasoning-budget.cpp index ab540a8463..3028fb4d8f 100644 --- a/tests/test-reasoning-budget.cpp +++ b/tests/test-reasoning-budget.cpp @@ -61,8 +61,6 @@ static void test_reasoning_budget( // Feed the sequence and track when forcing occurs for (size_t i = 0; i < sequence.size(); i++) { - llama_sampler_accept(sampler, sequence[i]); - // Check if we're in forcing state by applying and seeing if logits are modified cur_p.selected = -1; for (size_t j = 0; j < cur.size(); j++) { @@ -81,6 +79,8 @@ static void test_reasoning_budget( } } + llama_sampler_accept(sampler, sequence[i]); + fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token); if (finite_count == 1) { @@ -167,9 +167,9 @@ int main(void) { } // Test 2: Budget exhausted, forcing should occur - // Flow: i=0 accept(100)->COUNTING, i=1 accept(50)->remaining=1, i=2 accept(51)->remaining=0->FORCING - // Forcing is active at i=2 and i=3 (when apply() is called while in FORCING state) - // At i=4, force_pos becomes 2 which equals forced_tokens.size(), so state becomes DONE + // Flow: i=0 apply()->passthrough, accept(100)->COUNTING; i=1 accept(50)->remaining=1 + // i=2 accept(51)->remaining=0->FORCING; i=3 apply() forces token[0]; i=4 apply() forces token[1] + // At i=4, accept() advances force_pos to 2 which equals forced_tokens.size(), so state becomes DONE { const std::vector start = {100}; const std::vector end = {101}; @@ -179,13 +179,12 @@ int main(void) { test_reasoning_budget("budget exhausted forcing", sequence, start, end, forced, 2, // budget of 2 tokens REASONING_BUDGET_IDLE, - 2, // forcing starts at i=2 (after accept(51) depletes budget, apply() forces) - 3); // forcing continues through i=3 (at i=4 state becomes DONE) + 3, // forcing starts at i=3 (accept at i=2 depletes budget, apply at i=3 forces) + 4); // forcing continues through i=4 (accept at i=4 transitions to DONE) } // Test 3: Activate immediately with budget=0, forcing should start right away - // Flow: Since no start token in sequence, state stays IDLE (no start/end configured means passthrough) - // This test needs start token to be in the sequence or use activate_immediately with start token present + // Flow: init promotes COUNTING+budget=0 to FORCING, so apply() sees FORCING at i=0 { const std::vector start = {100}; const std::vector end = {101}; @@ -195,8 +194,8 @@ int main(void) { test_reasoning_budget("activate immediately budget=0", sequence, start, end, forced, 0, // budget of 0 tokens REASONING_BUDGET_COUNTING, // starts counting, promoted to FORCING since budget=0 - 0, // forcing starts at i=0 (after accept(100), budget=0 goes straight to FORCING) - 1); // forcing continues through i=1 (at i=2 state becomes DONE) + 0, // forcing starts at i=0 (initialized in FORCING, apply forces immediately) + 1); // forcing continues through i=1 (accept at i=1 transitions to DONE) } // Test 4: No start/end tokens configured - passthrough (no forcing) @@ -214,7 +213,7 @@ int main(void) { // Test 5: Activate immediately with budget > 0, count down then force // Flow: i=0 accept(50)->remaining=1, i=1 accept(51)->remaining=0->FORCING - // So forcing starts at i=1 (apply after accept sees FORCING with force_pos=0) + // Forcing starts at i=2 (apply sees FORCING after accept at i=1 transitioned) { const std::vector start = {100}; const std::vector end = {101}; @@ -224,8 +223,8 @@ int main(void) { test_reasoning_budget("activate immediately with budget", sequence, start, end, forced, 2, // budget of 2 tokens REASONING_BUDGET_COUNTING, - 1, // forcing starts at i=1 (after 2 accepts deplete budget) - 2); // forcing continues through i=2 + 2, // forcing starts at i=2 (after 2 accepts deplete budget, apply at i=2 forces) + 3); // forcing continues through i=3 } printf("OK (5 tests passed)\n"); diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 65d14e9728..f5b4426f6f 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -100,7 +100,7 @@ struct cli_context { } // reasoning budget sampler - if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) { + if (!chat_params.thinking_end_tag.empty()) { const llama_vocab * vocab = llama_model_get_vocab( llama_get_model(ctx_server.get_llama_context())); diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index e01c8c53df..ed5e306fc5 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1110,7 +1110,7 @@ json oaicompat_chat_params_parse( reasoning_budget = json_value(body, "thinking_budget_tokens", -1); } - if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) { + if (!chat_params.thinking_end_tag.empty()) { llama_params["reasoning_budget_tokens"] = reasoning_budget; llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag; llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 7d543b9292..3018ac90f8 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -478,19 +478,17 @@ task_params server_task::params_from_json_cmpl( // Parse reasoning budget sampler parameters { const int32_t budget = json_value(data, "reasoning_budget_tokens", (int32_t) -1); - if (budget >= 0) { - const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string()); - const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string()); - const auto message = json_value(data, "reasoning_budget_message", std::string()); - params.sampling.reasoning_budget_tokens = budget; + const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string()); + const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string()); + const auto message = json_value(data, "reasoning_budget_message", std::string()); + params.sampling.reasoning_budget_tokens = budget; - if (!start_tag.empty()) { - params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true); - } - if (!end_tag.empty()) { - params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true); - params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true); - } + if (!start_tag.empty()) { + params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true); + } + if (!end_tag.empty()) { + params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true); + params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true); SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n", budget, params.sampling.generation_prompt.c_str(),