From ea5ed7ab73b47e793d263d9d65f11b9749a6a9a0 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sun, 15 Mar 2026 20:00:17 +0100 Subject: [PATCH] Refactor cli parameters, update docs, move reasoning budget sampler part to common/reasoning-budget.cpp --- common/reasoning-budget.cpp | 60 ++++++++++++-- common/reasoning-budget.h | 26 ++++-- common/sampling.cpp | 26 +----- docs/autoparser.md | 79 ++++++++++--------- tools/server/README.md | 6 +- .../services/parameter-sync.service.spec.ts | 2 +- tools/server/webui/src/lib/types/api.d.ts | 2 +- 7 files changed, 117 insertions(+), 84 deletions(-) diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp index a55e4f509d..2ef744278a 100644 --- a/common/reasoning-budget.cpp +++ b/common/reasoning-budget.cpp @@ -163,9 +163,15 @@ static void common_reasoning_budget_reset(struct llama_sampler * smpl) { ctx->force_pos = 0; } +// forward declaration for use in clone +static struct llama_sampler * common_reasoning_budget_init_state( + const struct llama_vocab * vocab, const std::vector & start_tokens, + const std::vector & end_tokens, const std::vector & forced_tokens, + int32_t budget, common_reasoning_budget_state initial_state); + static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) { const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx; - return common_reasoning_budget_init( + return common_reasoning_budget_init_state( ctx->vocab, ctx->start_matcher.tokens, ctx->end_matcher.tokens, @@ -191,13 +197,13 @@ static struct llama_sampler_i common_reasoning_budget_i = { /* .backend_set_input = */ nullptr, }; -struct llama_sampler * common_reasoning_budget_init( - const struct llama_vocab * vocab, - const std::vector & start_tokens, - const std::vector & end_tokens, - const std::vector & forced_tokens, - int32_t budget, - common_reasoning_budget_state initial_state) { +static struct llama_sampler * common_reasoning_budget_init_state( + const struct llama_vocab * 
vocab, + const std::vector & start_tokens, + const std::vector & end_tokens, + const std::vector & forced_tokens, + int32_t budget, + common_reasoning_budget_state initial_state) { // promote COUNTING with budget <= 0 to FORCING if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) { initial_state = REASONING_BUDGET_FORCING; @@ -217,3 +223,41 @@ struct llama_sampler * common_reasoning_budget_init( } ); } + +struct llama_sampler * common_reasoning_budget_init( + const struct llama_vocab * vocab, + const std::vector & start_tokens, + const std::vector & end_tokens, + const std::vector & forced_tokens, + int32_t budget, + const std::vector & prefill_tokens) { + // Determine initial state from prefill: COUNTING if the prefill begins with + // the start sequence but does not also contain the end sequence after it. + common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE; + if (!prefill_tokens.empty() && !start_tokens.empty() && + prefill_tokens.size() >= start_tokens.size() && + std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) { + initial_state = REASONING_BUDGET_COUNTING; + // If the end sequence also follows the start in the prefill, reasoning + // was opened and immediately closed — stay IDLE. 
+ if (!end_tokens.empty() && + prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) { + auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size(); + if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() && + std::equal(end_tokens.begin(), end_tokens.end(), end_start)) { + initial_state = REASONING_BUDGET_IDLE; + } + } + } + return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state); +} + +struct llama_sampler * common_reasoning_budget_init( + const struct llama_vocab * vocab, + const std::vector & start_tokens, + const std::vector & end_tokens, + const std::vector & forced_tokens, + int32_t budget, + common_reasoning_budget_state initial_state) { + return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state); +} diff --git a/common/reasoning-budget.h b/common/reasoning-budget.h index 08ad282481..130afdea4a 100644 --- a/common/reasoning-budget.h +++ b/common/reasoning-budget.h @@ -24,14 +24,26 @@ enum common_reasoning_budget_state { // DONE: passthrough forever // // Parameters: -// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr) -// start_tokens - token sequence that activates counting -// end_tokens - token sequence for natural deactivation -// forced_tokens - token sequence forced when budget expires -// budget - max tokens allowed in the reasoning block -// initial_state - initial state of the sampler (e.g. 
IDLE or COUNTING) -// note: COUNTING with budget <= 0 is promoted to FORCING +// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr) +// start_tokens - token sequence that activates counting +// end_tokens - token sequence for natural deactivation +// forced_tokens - token sequence forced when budget expires +// budget - max tokens allowed in the reasoning block +// prefill_tokens - tokens already present in the prompt (generation prompt); +// used to determine the initial state: COUNTING if they begin +// with start_tokens (but don't also end with end_tokens), +// IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING. // +struct llama_sampler * common_reasoning_budget_init( + const struct llama_vocab * vocab, + const std::vector & start_tokens, + const std::vector & end_tokens, + const std::vector & forced_tokens, + int32_t budget, + const std::vector & prefill_tokens = {}); + +// Variant that takes an explicit initial state (used by tests and clone). +// COUNTING with budget <= 0 is promoted to FORCING. struct llama_sampler * common_reasoning_budget_init( const struct llama_vocab * vocab, const std::vector & start_tokens, diff --git a/common/sampling.cpp b/common/sampling.cpp index 6237e40a4c..f03ed4a11d 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -276,37 +276,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st // reasoning budget sampler — added first so it can force tokens before other samplers if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) { - // Determine initial state from the grammar prefill: if the prefill tokens - // match the start sequence, reasoning is already open → start COUNTING. 
- bool activate_immediately = false; - if (!prefill_tokens.empty() && !params.reasoning_budget_start.empty() && - prefill_tokens.size() >= params.reasoning_budget_start.size()) { - activate_immediately = std::equal( - params.reasoning_budget_start.begin(), - params.reasoning_budget_start.end(), - prefill_tokens.begin()); - // But if the prefill also contains the end sequence after the start, - // reasoning was opened and closed — start IDLE instead. - if (activate_immediately && - prefill_tokens.size() >= params.reasoning_budget_start.size() + params.reasoning_budget_end.size()) { - auto end_begin = prefill_tokens.begin() + (ptrdiff_t) params.reasoning_budget_start.size(); - // Check if remaining tokens after start match the end sequence - // (possibly with whitespace tokens in between, but for simplicity check suffix) - auto end_start = prefill_tokens.end() - (ptrdiff_t) params.reasoning_budget_end.size(); - if (end_start >= end_begin && - std::equal(params.reasoning_budget_end.begin(), params.reasoning_budget_end.end(), end_start)) { - activate_immediately = false; - } - } - } - samplers.push_back(common_reasoning_budget_init( vocab, params.reasoning_budget_start, params.reasoning_budget_end, params.reasoning_budget_forced, params.reasoning_budget_tokens, - activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE)); + prefill_tokens)); } if (params.has_logit_bias()) { diff --git a/docs/autoparser.md b/docs/autoparser.md index 7ba8f459f1..adc4d43ed6 100644 --- a/docs/autoparser.md +++ b/docs/autoparser.md @@ -14,7 +14,7 @@ The unified auto-parser uses a pure differential, compositional approach (inspir **Analysis + Parser Building in Two Steps**: 1. `autoparser::autoparser tmpl_analysis(tmpl)` — runs all differential comparisons and populates the analysis structs -2. `autoparser::peg_generator::generate_parser(tmpl, params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar +2. 
`autoparser::peg_generator::generate_parser(tmpl, generation_params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar ## Data Structures @@ -34,7 +34,7 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h ### `analyze_tools` and its sub-structs -- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`, `uses_python_dicts`) +- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`) - [common/chat-auto-parser.h:196-200](common/chat-auto-parser.h#L196-L200) — `tool_function_analysis`: `name_prefix`, `name_suffix`, `close` markers around function names - [common/chat-auto-parser.h:202-210](common/chat-auto-parser.h#L202-L210) — `tool_arguments_analysis`: `start/end` container markers, `name_prefix/suffix`, `value_prefix/suffix`, `separator` - [common/chat-auto-parser.h:212-217](common/chat-auto-parser.h#L212-L217) — `tool_id_analysis`: `pos` enum, `prefix`/`suffix` markers around call ID values @@ -50,13 +50,17 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h | `TAG_BASED` | Tag-based: `...` (start can be empty for delimiter-style formats) | | `TOOLS_ONLY` | Reasoning only appears in tool call responses, not plain content | -**Reasoning Prefill**: Extracted in `generate_parser()` using `compare_variants(add_generation_prompt=false, add_generation_prompt=true)` on a minimal single-user-message input to get exactly what the template appends as its generation prompt. 
The start marker is then located within this suffix using `rfind`. Three outcomes: +**Generation Prompt & Reasoning Prefill**: Computed in `common_chat_templates_apply_jinja` before invoking either the specialized handlers or the auto-parser, by rendering the template twice — once with `add_generation_prompt=false` and once with `add_generation_prompt=true` — and storing the diff suffix as `generation_params::generation_prompt`. This string is propagated into `common_chat_params::generation_prompt` and `common_chat_parser_params::generation_prompt`. -1. **Start+end in generation prompt** (e.g. `\n`): `prefill = start + end`. The parser sees reasoning as opened and immediately closed. -2. **Only start in generation prompt** (e.g. `\n`): `prefill = from_start` (substring from the marker's position to the end, preserving whitespace). The parser sees reasoning as already open. -3. **Start marker in the generation prompt but not at its end** (e.g. Apriel's `<|begin_assistant|>` followed by boilerplate): the marker is a template artifact. The start literal is cleared from the parser so reasoning uses delimiter-style (end-only). For templates that ignore `add_generation_prompt` (empty diff), the rendered `data.prompt` is used as fallback — but only for non-TOOLS_ONLY modes, since in TOOLS_ONLY the start tag is model-generated and may appear in prior conversation turns. +The generation prompt is prepended to model output before PEG parsing via `wrap_for_generation_prompt()`. The portion *before* the reasoning start marker (if any) is prepended as a literal to ensure any boilerplate added by the template is consumed. The full string is also fed to the grammar sampler via `llama_sampler_accept` (stored in `common_params_sampling::grammar_prefill`), advancing the grammar past tokens already in the prompt. 
The same string is also used to determine the reasoning budget sampler's initial state — COUNTING if the prefill tokens begin with the reasoning start sequence (but don't also end with the end sequence), IDLE otherwise. -The prefill is prepended to model output before PEG parsing, fed to the grammar sampler via `llama_sampler_accept`, and used to determine the reasoning budget sampler's initial state (COUNTING if prefill starts with the reasoning start tokens, IDLE otherwise). +**`grammar_prefill`** (`common_params_sampling`): The generation prompt string, tokenized and accepted by the grammar sampler at init time. Only applied when `grammar_external` is false (i.e., the grammar was not set explicitly by the user). + +Three outcomes for reasoning-prefill handling (in `generate_parser()`): + +1. **Start+end in generation prompt** (e.g. `<think></think>\n`): the parser sees reasoning as opened and immediately closed; whitespace-only reasoning content is discarded. +2. **Only start in generation prompt** (e.g. `<think>\n`): the parser sees reasoning as already open. +3. **Start marker present but not at the end** (e.g. Apriel's `<|begin_assistant|>` followed by boilerplate): the marker is a template artifact; the start literal is cleared so reasoning uses delimiter-style (end-only). For templates that ignore `add_generation_prompt` (empty diff), the rendered `data.prompt` is used as fallback — but only for non-TOOLS_ONLY modes, since in TOOLS_ONLY the start tag is model-generated and may appear in prior conversation turns. **`content_mode`**: How the template wraps assistant content. @@ -273,10 +277,9 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark **R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt. 
-- Detects template-added reasoning markers: `enable_thinking=true` adds a non-empty marker at the end of the prompt — sets `reasoning.start`, mode = `TAG_BASED` -- Detects start+end pattern: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker — both classified as `TAG_BASED` -- Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers -- The reasoning prefill (markers added by the template) is later extracted in `generate_parser()` and prepended to model output before parsing +- Detects template-added reasoning markers: `enable_thinking=true` appends a non-empty marker → sets `reasoning.start`, mode = `TAG_BASED` +- Handles the reverse case (`enable_thinking=false` appends the marker instead): extracts both start (from the preceding segment) and end markers; mode = `TAG_BASED` +- The reasoning prefill (markers added by the template) is later extracted in `common_chat_templates_apply_jinja` and prepended to model output before parsing **R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls. @@ -349,7 +352,7 @@ Classification logic: A workaround array in `common/chat-diff-analyzer.cpp` applies post-hoc patches after analysis. Each workaround is a lambda that inspects the template source and overrides analysis results. Current workarounds: -1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('')`: sets `reasoning.mode = TAG_BASED` with ``/`` markers if no reasoning was detected +1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('')` but not ``: sets `reasoning.mode = TAG_BASED` with ``/`` markers if no reasoning was detected 2. **Granite 3.3** — source contains specific "Write your thoughts" text: forces `TAG_BASED` reasoning with ``/`` and `WRAPPED_WITH_REASONING` content with ``/`` 3. 
**Cohere Command R+** — source contains `<|CHATBOT_TOKEN|>`: sets `ALWAYS_WRAPPED` content mode if no content start is already set 4. **Functionary 3.1** — source contains `set has_code_interpreter`: forces `PLAIN` content, specific `per_call_start/end`, clears preserved tokens to only keep Functionary-specific markers @@ -361,13 +364,13 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i #### Reasoning Parser (`analyze_reasoning::build_parser`) -| Mode | Parser | -|-----------------------------------|---------------------------------------------------------------------| -| Not extracting reasoning | `eps()` | -| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end)` | -| `TAG_BASED` or `TOOLS_ONLY` (empty start) | `optional(reasoning(until(end)) + end)` — delimiter-style| +| Mode | Parser | +|-----------------------------------------------|---------------------------------------------------------------------------| +| Not extracting reasoning | `eps()` | +| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end + space())` | +| `TAG_BASED` or `TOOLS_ONLY` (empty start) | `optional(reasoning(until(end)) + end + space())` — delimiter-style | -Note: The start marker may be empty either because the analyzer detected delimiter-style reasoning, or because `generate_parser()` cleared a template artifact start marker (see Reasoning Prefill above). The reasoning prefill is prepended to model output before parsing. +Note: The start marker may be empty either because the analyzer detected delimiter-style reasoning, or because `generate_parser()` cleared a template artifact start marker (see Generation Prompt & Reasoning Prefill above). Whitespace-only reasoning content (e.g. from a `` prefill) is discarded by the mapper. 
#### Content Parser (`analyze_content::build_parser`) @@ -417,9 +420,7 @@ All three tool parsers return: reasoning + optional(content(until(trigger_marker))) + tool_calls + end() ``` -### Python Dict Format - -When `format.uses_python_dicts` is true (detected when single-quoted strings appear in JSON argument context), `build_parser()` pre-registers a `json-string` rule that accepts both single-quoted and double-quoted strings. This is done before any `p.json()` call so all JSON parsing inherits the flexible rule. +Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepends a literal for any boilerplate prefix of the generation prompt (the portion before the reasoning start marker). ## Mapper @@ -428,22 +429,22 @@ When `format.uses_python_dicts` is true (detected when single-quoted strings app - **Buffered arguments**: Before `tool_name` is known, argument text goes to `args_buffer`; once the name is set, the buffer is flushed to `current_tool->arguments` - **`args_target()`**: Returns a reference to whichever destination is currently active (buffer or tool args), eliminating branching - **`closing_quote_pending`**: Tracks whether a closing `"` needs to be appended when a string argument value is finalized (for schema-declared string types in tagged format) -- **Quote normalization**: Python-style quotes (`'key': 'value'`) are converted to JSON (`"key": "value"`) +- **Whitespace-only reasoning**: Reasoning content that consists entirely of whitespace (e.g. 
from a `` prefill) is cleared so the message shows no reasoning - **Brace auto-closing**: At tool close, unclosed `{` braces are closed automatically ## Files -| File | Purpose | -|-------------------------------------------|----------------------------------------------------------------------| -| `common/chat-auto-parser.h` | All analysis structs, enums, `autoparser`, `peg_generator`, `templates_params` | -| `common/chat-auto-parser-generator.cpp` | Parser generator: `generate_parser()` and `build_parser()` methods | -| `common/chat-diff-analyzer.cpp` | Differential analysis implementation and workarounds | -| `common/chat-auto-parser-helpers.h/cpp` | `calculate_diff_split()`, `segmentize_markers()`, | -| | `compare_variants()`, string helpers | -| `common/chat-peg-parser.h/cpp` | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers | -| `common/chat.cpp` | Entry point: `common_chat_templates_apply_jinja()` | -| `tools/parser/debug-template-parser.cpp` | Debug tool for template analysis | -| `tools/parser/template-analysis.cpp` | Template analysis tool | +| File | Purpose | +|-------------------------------------------|---------------------------------------------------------------------------------| +| `common/chat-auto-parser.h` | All analysis structs, enums, `autoparser`, `peg_generator`, `generation_params` | +| `common/chat-auto-parser-generator.cpp` | Parser generator: `generate_parser()` and `build_parser()` methods | +| `common/chat-diff-analyzer.cpp` | Differential analysis implementation and workarounds | +| `common/chat-auto-parser-helpers.h/cpp` | `calculate_diff_split()`, `segmentize_markers()`, `compare_variants()`, | +| | `wrap_for_generation_prompt()`, string helpers | +| `common/chat-peg-parser.h/cpp` | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers | +| `common/chat.cpp` | Entry point: `common_chat_templates_apply_jinja()` | +| `tools/parser/debug-template-parser.cpp` | Debug tool for template analysis | +| 
`tools/parser/template-analysis.cpp` | Template analysis tool | ## Testing & Debugging @@ -523,10 +524,10 @@ To support a new template format: ## Edge Cases and Quirks -1. **Reasoning Prefill**: See the `reasoning_mode` enum section above for the full description. Key detail: the generation prompt suffix is extracted via `compare_variants(add_generation_prompt=false, add_generation_prompt=true)` to avoid false positives from prior conversation turns. +1. **Generation Prompt & Reasoning Prefill**: The generation prompt is extracted by diffing `add_generation_prompt=false` vs `true` in `common_chat_templates_apply_jinja`, so it contains exactly what the template appends — avoiding false positives from prior conversation turns. 2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker. -3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built. -4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `` or `[marker]` tokens, ensuring clean extraction. -5. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case. -6. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`. -7. 
**`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats. +3. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `` or `[marker]` tokens, ensuring clean extraction. +4. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case. +5. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`. +6. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats. +7. **Undetected Tool Format**: If `analyze_tools` concludes tool calling is supported but cannot determine the format, `build_parser()` logs an error and returns `eps()` (graceful degradation) rather than aborting. 
diff --git a/tools/server/README.md b/tools/server/README.md index 90319a7ee7..25ccad8b8c 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat "chat_format": "GPT-OSS", "reasoning_format": "none", "reasoning_in_content": false, - "prefill": "", + "generation_prompt": "", "samplers": [ "penalties", "dry", @@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat "chat_format": "GPT-OSS", "reasoning_format": "none", "reasoning_in_content": false, - "prefill": "", + "generation_prompt": "", "samplers": [ "penalties", "dry", @@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type": `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text. -`prefill`: The reasoning markers that were prefilled in the prompt by the template. Prepended to model output before parsing to handle dynamic thinking/non-thinking modes. +`generation_prompt`: The generation prompt that the chat template appended to the end of the prompt (i.e. the prefill). It is prepended to model output before parsing. `parse_tool_calls`: Whether to parse the generated tool call. 
diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts index daad0189e8..a4c32d8683 100644 --- a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts +++ b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts @@ -51,7 +51,7 @@ describe('ParameterSyncService', () => { chat_format: '', reasoning_format: '', reasoning_in_content: false, - prefill: '', + generation_prompt: '', 'speculative.n_max': 0, 'speculative.n_min': 0, 'speculative.p_min': 0.0, diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts index 566ff8ce14..76e1f77320 100644 --- a/tools/server/webui/src/lib/types/api.d.ts +++ b/tools/server/webui/src/lib/types/api.d.ts @@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps { chat_format: string; reasoning_format: string; reasoning_in_content: boolean; - prefill: string; + generation_prompt: string; samplers: string[]; backend_sampling: boolean; 'speculative.n_max': number;