679 changed files with 37765 additions and 60758 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -93,7 +93,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest -L main -E "test-llama-archs" --verbose --timeout 900
+          ctest -L main --verbose --timeout 900

  macOS-latest-cmake-x64:
    runs-on: macos-15-intel
@ -469,7 +469,6 @@ jobs:
          cd build
          export GGML_VK_VISIBLE_DEVICES=0
          export GGML_VK_DISABLE_F16=1
-          export GGML_VK_DISABLE_COOPMAT=1
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 4800

@ -1727,22 +1726,6 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-x64-linux-intel-vulkan:
-    runs-on: [self-hosted, Linux, X64, Intel]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          persist-credentials: false
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
  ggml-ci-arm64-cpu-kleidiai:
     runs-on: ubuntu-22.04-arm

--- a/CODEOWNERS
+++ b/CODEOWNERS
@ -11,8 +11,6 @@
 /common/base64.hpp.*                    @ggerganov
 /common/build-info.*                    @ggerganov
 /common/chat.*                          @pwilkin
-/common/chat-auto*.*                    @pwilkin
-/common/chat-diff-analyzer.*            @pwilkin
 /common/chat-peg-parser.*               @aldehir
 /common/common.*                        @ggerganov
 /common/console.*                       @ggerganov
@ -91,13 +89,12 @@
 /src/llama-vocab.*                      @CISC
 /src/models/                            @CISC
 /tests/                                 @ggerganov
-/tests/test-chat.*                      @pwilkin
+/tests/test-chat-.*                     @pwilkin
 /tools/batched-bench/                   @ggerganov
 /tools/cli/                             @ngxson
 /tools/completion/                      @ggerganov
 /tools/mtmd/                            @ngxson
 /tools/perplexity/                      @ggerganov
-/tools/parser/                          @pwilkin
 /tools/quantize/                        @ggerganov
 /tools/rpc/                             @rgerganov
 /tools/server/*                         @ngxson @ggerganov # no subdir
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -39,7 +39,6 @@ Before submitting your PR:
    - For intricate features, consider opening a feature request first to discuss and align expectations
    - When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.

 After submitting your PR:
 - Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
@ -160,7 +159,7 @@ Maintainers reserve the right to decline review or close pull requests for any r

 # Code maintenance

- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
+- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
  - Reviewing and merging related PRs
  - Fixing related bugs
  - Providing developer guidance/support
--- a/README.md
+++ b/README.md
@ -259,8 +259,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 - [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
 - [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
- [LLMKube](https://github.com/defilantech/llmkube) - Kubernetes operator for llama.cpp with multi-GPU and Apple Silicon Metal
-  support"
 </details>

 <details>
@ -289,7 +287,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
-| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
+| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
 | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |

 ## Obtaining and quantizing models
--- a/benches/nemotron/nemotron-dgx-spark.md
+++ b/benches/nemotron/nemotron-dgx-spark.md
@ -1,72 +0,0 @@
-# NVIDIA DGX Spark
-
-## System info
-
-```bash
-uname --all
-Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
-
-g++ --version
-g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
-
-nvidia-smi
-Fri Mar  6 11:39:45 2026
-+-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
-+-----------------------------------------+------------------------+----------------------+
-| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
-|                                         |                        |               MIG M. |
-|=========================================+========================+======================|
-|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
-| N/A   52C    P0             13W /  N/A  | Not Supported          |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-```
-
-## ggml-org/nemotron-3-super-120b-GGUF
-
-Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF
-
- `llama-batched-bench`
-
-main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    1.094 |   468.05 |    1.621 |    19.74 |    2.715 |   200.37 |
-|   512 |     32 |    2 |   1088 |    1.463 |   700.16 |    2.437 |    26.26 |    3.900 |   279.01 |
-|   512 |     32 |    4 |   2176 |    2.647 |   773.76 |    4.043 |    31.66 |    6.689 |   325.29 |
-|   512 |     32 |    8 |   4352 |    5.291 |   774.14 |    6.151 |    41.62 |   11.442 |   380.37 |
-|   512 |     32 |   16 |   8704 |   10.603 |   772.62 |   10.385 |    49.30 |   20.987 |   414.72 |
-|   512 |     32 |   32 |  17408 |   21.231 |   771.69 |   18.235 |    56.16 |   39.466 |   441.09 |
-|  4096 |     32 |    1 |   4128 |    5.340 |   767.05 |    1.616 |    19.81 |    6.956 |   593.47 |
-|  4096 |     32 |    2 |   8256 |   10.673 |   767.55 |    2.454 |    26.08 |   13.127 |   628.94 |
-|  4096 |     32 |    4 |  16512 |   21.348 |   767.46 |    4.072 |    31.44 |   25.420 |   649.57 |
-|  4096 |     32 |    8 |  33024 |   42.714 |   767.15 |    6.277 |    40.78 |   48.991 |   674.08 |
-|  4096 |     32 |   16 |  66048 |   85.385 |   767.54 |   10.596 |    48.32 |   95.981 |   688.14 |
-|  4096 |     32 |   32 | 132096 |  170.819 |   767.32 |   18.619 |    55.00 |  189.437 |   697.31 |
-|  8192 |     32 |    1 |   8224 |   10.690 |   766.32 |    1.619 |    19.76 |   12.310 |   668.10 |
-|  8192 |     32 |    2 |  16448 |   21.382 |   766.24 |    2.467 |    25.94 |   23.850 |   689.65 |
-|  8192 |     32 |    4 |  32896 |   42.782 |   765.92 |    4.098 |    31.23 |   46.881 |   701.69 |
-|  8192 |     32 |    8 |  65792 |   85.582 |   765.77 |    6.368 |    40.20 |   91.951 |   715.52 |
-|  8192 |     32 |   16 | 131584 |  171.066 |   766.21 |   10.774 |    47.52 |  181.840 |   723.62 |
-|  8192 |     32 |   32 | 263168 |  342.140 |   766.19 |   18.969 |    53.98 |  361.109 |   728.78 |
-
-
- `llama-bench`
-
-| model                   |       size |     params | backend    | n_ubatch | fa |            test |                  t/s |
-| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |          pp2048 |        768.84 ± 0.90 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |            tg32 |         19.94 ± 0.16 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |  pp2048 @ d4096 |        764.51 ± 0.50 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |    tg32 @ d4096 |         19.95 ± 0.18 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |  pp2048 @ d8192 |        759.53 ± 0.71 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |    tg32 @ d8192 |         19.83 ± 0.18 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d16384 |        747.98 ± 1.58 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |   tg32 @ d16384 |         19.84 ± 0.18 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d32768 |        724.40 ± 2.70 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |   tg32 @ d32768 |         19.45 ± 0.18 |
-
-build: 04a65daab (8268)
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -47,10 +47,10 @@ add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
-    chat-auto-parser-generator.cpp
-    chat-auto-parser-helpers.cpp
-    chat-auto-parser.h
-    chat-diff-analyzer.cpp
+    chat-parser.cpp
+    chat-parser.h
+    chat-parser-xml-toolcall.h
+    chat-parser-xml-toolcall.cpp
    chat-peg-parser.cpp
    chat-peg-parser.h
    chat.cpp
@ -81,8 +81,6 @@ add_library(${TARGET} STATIC
    preset.cpp
    preset.h
    regex-partial.cpp
-    reasoning-budget.cpp
-    reasoning-budget.h
    regex-partial.h
    sampling.cpp
    sampling.h
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -732,28 +732,23 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
        "llama-completion",
        "llama-convert-llama2c-to-ggml",
        "llama-cvector-generator",
-        "llama-debug",
-        "llama-diffusion-cli",
        "llama-embedding",
        "llama-eval-callback",
        "llama-export-lora",
-        "llama-finetune",
-        "llama-fit-params",
-        "llama-gemma3-cli",
        "llama-gen-docs",
        "llama-gguf",
        "llama-gguf-hash",
        "llama-gguf-split",
-        "llama-idle",
+        "llama-gritlm",
        "llama-imatrix",
-        "llama-llava-cli",
+        "llama-infill",
+        "llama-mtmd-cli",
+        "llama-llava-clip-quantize-cli",
        "llama-lookahead",
        "llama-lookup",
        "llama-lookup-create",
        "llama-lookup-merge",
        "llama-lookup-stats",
-        "llama-minicpmv-cli",
-        "llama-mtmd-cli",
        "llama-parallel",
        "llama-passkey",
        "llama-perplexity",
@ -1284,20 +1279,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
-        {"-ctxcp", "--ctx-checkpoints", "--swa-checkpoints"}, "N",
+        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
        string_format("max number of context checkpoints to create per slot (default: %d)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
-    add_opt(common_arg(
-        {"-cpent", "--checkpoint-every-n-tokens"}, "N",
-        string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
-        [](common_params & params, int value) {
-            params.checkpoint_every_nt = value;
-        }
-    ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-cram", "--cache-ram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
@ -2411,7 +2399,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.fit_params = false;
            } else {
                throw std::runtime_error(
-                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
            }
        }
    ).set_env("LLAMA_ARG_FIT"));
@ -2432,11 +2420,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                );
            }
            if (split_arg.size() == 1) {
-                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoull(split_arg[0]) * 1024*1024);
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
                return;
            }
            for (size_t i = 0; i < split_arg.size(); i++) {
-                params.fit_params_target[i] = std::stoull(split_arg[i]) * 1024*1024;
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
            }
        }
    ).set_env("LLAMA_ARG_FIT_TARGET"));
@ -2671,8 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
-                    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@ -2840,14 +2827,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.webui_config_json = read_file(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
-    add_opt(common_arg(
-        {"--webui-mcp-proxy"},
-        {"--no-webui-mcp-proxy"},
-        string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.webui_mcp_proxy = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
    add_opt(common_arg(
        {"--webui"},
        {"--no-webui"},
@ -2919,10 +2898,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            auto parsed = json::parse(value);
            for (const auto & item : parsed.items()) {
-                if (item.key() == "enable_thinking") {
-                    LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
-                            "Use --reasoning on / --reasoning off instead.\n");
-                }
                params.default_template_kwargs[item.key()] = item.value().dump();
            }
        }
@ -3058,39 +3033,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.reasoning_format = common_reasoning_format_from_name(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
-    add_opt(common_arg(
-        {"-rea", "--reasoning"}, "[on|off|auto]",
-        "Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
-        [](common_params & params, const std::string & value) {
-            if (is_truthy(value)) {
-                params.enable_reasoning = 1;
-                params.default_template_kwargs["enable_thinking"] = "true";
-            } else if (is_falsey(value)) {
-                params.enable_reasoning = 0;
-                params.default_template_kwargs["enable_thinking"] = "false";
-            } else if (is_autoy(value)) {
-                params.enable_reasoning = -1;
-            } else {
-                throw std::invalid_argument(
-                    string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
    add_opt(common_arg(
        {"--reasoning-budget"}, "N",
-        "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
        [](common_params & params, int value) {
-            if (value < -1) { throw std::invalid_argument("invalid value"); }
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
            params.reasoning_budget = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
-    add_opt(common_arg(
-        {"--reasoning-budget-message"}, "MESSAGE",
-        "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
-        [](common_params & params, const std::string & value) {
-            params.reasoning_budget_message = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@ -3642,13 +3592,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg(
-        {"--check"},
-        string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
-        [](common_params & params) {
-            params.check = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_RESULTS}));
    add_opt(common_arg(
        {"--save-logits"},
        string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -1,450 +0,0 @@
-#include "chat-auto-parser.h"
-#include "chat-peg-parser.h"
-#include "chat.h"
-#include "common.h"
-#include "json-schema-to-grammar.h"
-#include "nlohmann/json.hpp"
-
-#include <stdexcept>
-#include <string>
-
-using json = nlohmann::ordered_json;
-
-// Helper to iterate over tools/functions
-static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
-    for (const auto & tool : tools) {
-        if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
-            continue;
-        }
-        fn(tool);
-    }
-}
-
-namespace autoparser {
-
-parser_build_context::parser_build_context(common_chat_peg_builder & p, const templates_params & inputs) :
-    p(p),
-    inputs(inputs),
-    reasoning_parser(p.eps()) {}
-
-common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct templates_params & inputs) {
-    // Run differential analysis to extract template structure
-    struct autoparser autoparser;
-    autoparser.analyze_template(tmpl);
-    return generate_parser(tmpl, inputs, autoparser);
-}
-
-common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct templates_params & inputs,
-                                                  const autoparser &              autoparser) {
-    // Build the parser using the analysis results
-    auto parser = autoparser.build_parser(inputs);
-
-    // Create the result structure
-    common_chat_params data;
-    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
-    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens = autoparser.preserved_tokens;
-    data.parser           = parser.save();
-
-    // Build grammar if tools are present
-    bool has_tools =
-        autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
-    std::string trigger_marker = !autoparser.tools.format.section_start.empty() ? autoparser.tools.format.section_start :
-                                                                                  autoparser.tools.format.per_call_start;
-
-    bool has_response_format = !inputs.json_schema.empty() && inputs.json_schema.is_object();
-    bool include_grammar = has_response_format || (has_tools &&
-            ((inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO && !trigger_marker.empty()) ||
-              inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
-
-    if (include_grammar) {
-        data.grammar_lazy = !has_response_format && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
-        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                auto         schema   = function.at("parameters");
-                builder.resolve_refs(schema);
-            });
-            parser.build_grammar(builder, data.grammar_lazy);
-        });
-
-        // Set grammar triggers based on tool section markers (fall back to per-call markers)
-        if (data.grammar_lazy) {
-            data.grammar_triggers = {
-                { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
-            };
-        }
-    }
-
-    return data;
-}
-
-common_peg_arena autoparser::build_parser(const templates_params & inputs) const {
-    if (!analysis_complete) {
-        throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
-    }
-    return build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        // If the template uses Python dict format (single-quoted strings in JSON structures),
-        // pre-register a json-string rule that accepts both quote styles. This must happen
-        // before any call to p.json() so that all JSON parsing inherits the flexible rule.
-        if (tools.format.uses_python_dicts) {
-            p.rule("json-string", p.quoted_string());
-        }
-
-        parser_build_context ctx(p, inputs);
-        bool                 extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-        bool                 enable_thinking   = inputs.enable_thinking;
-
-        ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
-        ctx.content              = &content;
-
-        // Build reasoning parser
-        ctx.reasoning_parser = reasoning.build_parser(ctx);
-
-        bool has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
-        bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
-
-        if (has_response_format) {
-            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
-            return ctx.reasoning_parser + p.space() + p.choice({
-                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
-                response_format
-            }) + p.end();
-        }
-
-        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
-            return tools.build_parser(ctx);
-        }
-
-        return content.build_parser(ctx);
-    });
-}
-
-common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) const {
-    auto & p = ctx.p;
-
-    if (!ctx.extracting_reasoning) {
-        return p.eps();
-    }
-
-    bool thinking_forced_open   = (mode == reasoning_mode::FORCED_OPEN);
-    bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
-
-    if (thinking_forced_open || thinking_forced_closed) {
-        // Thinking is forced open OR forced closed with enable_thinking=true
-        // In both cases, expect only the closing tag (opening was in template)
-        // However, since we might have incorrectly detected the open/close pattern,
-        // we admit an optional starting marker
-        return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
-    }
-    if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
-        // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
-        // Both use the same tag-based pattern if markers are available
-        if (!start.empty() && !end.empty()) {
-            return p.optional(start + p.reasoning(p.until(end)) + end);
-        }
-    } else if (mode == reasoning_mode::DELIMITER) {
-        return p.optional(p.reasoning(p.until(end)) + end);
-    }
-
-    return p.eps();
-}
-
-common_peg_parser analyze_content::build_parser(parser_build_context & ctx) const {
-    auto & p = ctx.p;
-
-    if (is_always_wrapped()) {
-        if (ctx.extracting_reasoning) {
-            return ctx.reasoning_parser + start + p.content(p.until(end)) + end + p.end();
-        }
-        return p.content(p.until(start)) + start + p.content(p.until(end)) + end + p.end();
-    }
-    return ctx.reasoning_parser + p.content(p.rest()) + p.end();
-}
-
-common_peg_parser analyze_content::build_optional_wrapped(parser_build_context & ctx) const {
-    auto & p = ctx.p;
-
-    if (is_always_wrapped()) {
-        return p.optional(start + p.content(p.until(end)) + end);
-    }
-    return p.eps();
-}
-
-common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const {
-    switch (format.mode) {
-        case tool_format::JSON_NATIVE:
-            return build_tool_parser_json_native(ctx);
-        case tool_format::TAG_WITH_JSON:
-            return build_tool_parser_tag_json(ctx);
-        case tool_format::TAG_WITH_TAGGED:
-            return build_tool_parser_tag_tagged(ctx);
-        default:
-            GGML_ABORT("Unable to create tool parser");
-    }
-}
-
-common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
-    auto &       p           = ctx.p;
-    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    // Build effective field names with dot notation if function_field is set
-    std::string name_field = format.name_field;
-    std::string args_field = format.args_field;
-
-    if (!format.function_field.empty() && format.function_field != "function" &&
-        name_field.find('.') == std::string::npos) {
-        name_field = format.function_field + "." + name_field;
-        args_field = format.function_field + "." + args_field;
-    }
-
-    auto tools_parser = p.standard_json_tools(
-        format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
-        inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-        format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
-
-    // Handle content wrappers if present
-    if (ctx.content && ctx.content->is_always_wrapped()) {
-        auto wrapped_content = ctx.content->build_optional_wrapped(ctx);
-        return ctx.reasoning_parser + wrapped_content + tools_parser + p.end();
-    }
-
-    std::string tool_start = "{";
-    if (!format.section_start.empty()) {
-        tool_start = format.section_start;
-    } else if (!format.per_call_start.empty()) {
-        tool_start = format.per_call_start;
-    }
-
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
-           p.end();
-}
-
-common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
-    auto &       p           = ctx.p;
-    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_choice = p.choice();
-
-    foreach_function(inputs.tools, [&](const json & tool) {
-        const auto & func   = tool.at("function");
-        std::string  name   = func.at("name");
-        const auto & schema = func.at("parameters");
-
-        // Build call_id parser based on position (if supported)
-        common_peg_parser call_id_section = p.eps();
-        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            !call_id.suffix.empty()) {
-            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
-        }
-
-        auto func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                           call_id_section + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
-        if (!function.close.empty()) {
-            func_parser = func_parser + function.close;
-        }
-        tool_choice |= p.rule("tool-" + name, func_parser);
-    });
-
-    auto require_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_calls = p.eps();
-
-    if (!format.per_call_start.empty()) {
-        auto wrapped_call = format.per_call_start + tool_choice + format.per_call_end;
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
-        } else {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call);
-        }
-        if (!format.section_start.empty()) {
-            tool_calls = p.trigger_rule("tool-calls",
-                                        p.literal(format.section_start) + p.space() + tool_calls + p.space() +
-                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end)));
-        }
-    } else {
-        std::string separator = ", ";  // Default
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice +
-                                                         p.zero_or_more(separator + tool_choice) + format.section_end);
-        } else {
-            tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice + format.section_end);
-        }
-    }
-
-    if (!require_calls) {
-        tool_calls = p.optional(tool_calls);
-    }
-
-    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
-    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
-}
-
-common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
-    auto &       p           = ctx.p;
-    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_choice = p.choice();
-
-    foreach_function(inputs.tools, [&](const json & tool) {
-        const auto & func   = tool.at("function");
-        std::string  name   = func.at("name");
-        const auto & params = func.at("parameters");
-
-        if (!params.contains("properties") || !params.at("properties").is_object()) {
-            return;
-        }
-
-        const auto &          properties = params.at("properties");
-        std::set<std::string> required;
-        if (params.contains("required") && params.at("required").is_array()) {
-            params.at("required").get_to(required);
-        }
-
-        // Build parser for each argument, separating required and optional
-        std::vector<common_peg_parser> required_parsers;
-        std::vector<common_peg_parser> optional_parsers;
-        for (const auto & [param_name, param_schema] : properties.items()) {
-            bool        is_required = required.find(param_name) != required.end();
-            std::string type        = "object";
-            auto        type_obj    = param_schema.contains("type") ? param_schema.at("type") : json::object();
-            if (type_obj.is_string()) {
-                type_obj.get_to(type);
-            } else if (type_obj.is_object()) {
-                if (type_obj.contains("type") && type_obj.at("type").is_string()) {
-                    type_obj.at("type").get_to(type);
-                }
-            }
-
-            auto arg = p.tool_arg(
-                p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
-                                arguments.name_suffix) +
-                arguments.value_prefix +
-                (type == "string" ? p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
-                                                                     "tool-" + name + "-arg-" + param_name + "-schema",
-                                                                     param_schema, true)) :
-                                    p.tool_arg_json_value(p.schema(
-                                        p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, format.uses_python_dicts)) +
-                                        p.space()) +
-                p.tool_arg_close(p.literal(arguments.value_suffix)));
-
-            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
-            if (is_required) {
-                required_parsers.push_back(named_arg);
-            } else {
-                optional_parsers.push_back(named_arg);
-            }
-        }
-
-        // Build required arg sequence in definition order
-        common_peg_parser args_seq = p.eps();
-        for (size_t i = 0; i < required_parsers.size(); i++) {
-            if (i > 0) {
-                args_seq = args_seq + p.space();
-            }
-            args_seq = args_seq + required_parsers[i];
-        }
-
-        // Build optional args with flexible ordering
-        if (!optional_parsers.empty()) {
-            common_peg_parser any_opt = p.choice();
-            for (const auto & opt : optional_parsers) {
-                any_opt |= opt;
-            }
-            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
-        }
-
-        // Build call_id parser based on position (if supported)
-        common_peg_parser call_id_section = p.eps();
-        bool have_call_id = false;
-        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            !call_id.suffix.empty()) {
-            have_call_id = true;
-            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
-        }
-
-        bool matched_atomic = false;
-        common_peg_parser func_parser = p.eps();
-        if (!function.name_suffix.empty()) {
-            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section + p.space() + args_seq;
-            matched_atomic = true;
-        } else if (have_call_id) {
-            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section) + p.space() + args_seq;
-            matched_atomic = true;
-        } else if (!arguments.name_prefix.empty() && properties.size() > 0) {
-            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
-            matched_atomic = true;
-        } else {
-            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section + p.space() + args_seq;
-        }
-
-        if (!function.close.empty()) {
-            func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
-        } else if (!format.per_call_end.empty()) {
-            // When there's no func_close but there is a per_call_end marker, use peek() to ensure
-            // we only emit tool_close when we can actually see the closing marker. This prevents
-            // premature closing during partial parsing when we've seen e.g. "</" which could be
-            // either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
-            func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
-        } else {
-            func_parser =
-                func_parser + p.tool_close(p.space());  // force this to process tool closing callbacks in mapper
-        }
-        if (!matched_atomic) {
-            func_parser = p.atomic(func_parser);
-        }
-
-        tool_choice |= p.rule("tool-" + name, func_parser);
-    });
-
-    auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_calls = p.eps();
-
-    if (!format.per_call_start.empty()) {
-        auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
-        } else {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call);
-        }
-        if (!format.section_start.empty()) {
-            tool_calls = p.trigger_rule("tool-calls",
-                                        p.literal(format.section_start) + p.space() + tool_calls + p.space() +
-                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end)));
-        }
-    } else {
-        std::string separator = ", ";  // Default
-
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", format.section_start + p.space() + tool_choice +
-                                                         p.zero_or_more(separator + tool_choice) + p.space() +
-                                                         format.section_end);
-        } else {
-            tool_calls = p.trigger_rule(
-                "tool-call", format.section_start + p.space() + tool_choice + p.space() + format.section_end);
-        }
-    }
-
-    if (!require_tools) {
-        tool_calls = p.optional(tool_calls);
-    }
-
-    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
-    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
-}
-
-}  // namespace autoparser
--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@ -1,347 +0,0 @@
-#include "chat-auto-parser-helpers.h"
-
-#include "chat-auto-parser.h"
-#include "chat.h"
-#include "log.h"
-#include "nlohmann/json.hpp"
-
-#include <cctype>
-#include <numeric>
-
-using json = nlohmann::ordered_json;
-
-std::string trim_whitespace(const std::string & str) {
-    size_t start = 0;
-    while (start < str.length() && std::isspace(static_cast<unsigned char>(str[start]))) {
-        start++;
-    }
-
-    if (start == str.length()) {
-        return "";
-    }
-
-    size_t end = str.length() - 1;
-    while (end > start && std::isspace(static_cast<unsigned char>(str[end]))) {
-        end--;
-    }
-
-    return str.substr(start, end - start + 1);
-}
-
-std::string trim_leading_whitespace(const std::string & str) {
-    size_t start = 0;
-    while (start < str.length() && std::isspace(static_cast<unsigned char>(str[start]))) {
-        start++;
-    }
-
-    return str.substr(start);
-}
-
-std::string trim_trailing_whitespace(const std::string & str) {
-    if (str.empty()) {
-        return "";
-    }
-
-    size_t end = str.length() - 1;
-    while (end > 0 && std::isspace(static_cast<unsigned char>(str[end]))) {
-        end--;
-    }
-
-    // If first char is also whitespace, return empty string
-    if (end == 0 && std::isspace(static_cast<unsigned char>(str[0]))) {
-        return "";
-    }
-
-    return str.substr(0, end + 1);
-}
-
-std::string trim_trailing_newlines(const std::string & str) {
-    size_t end = str.length();
-    while (end > 0 && str[end - 1] == '\n') {
-        end--;
-    }
-
-    return str.substr(0, end);
-}
-
-static size_t common_prefix_len(const std::string & left, const std::string & right) {
-    size_t prefix_len = 0;
-    size_t min_len    = std::min(left.length(), right.length());
-    while (prefix_len < min_len && left[prefix_len] == right[prefix_len]) {
-        prefix_len++;
-    }
-    return prefix_len;
-}
-
-static size_t common_suffix_len(const std::string & left, const std::string & right) {
-    size_t suffix_len = 0;
-    size_t min_len    = std::min(left.length(), right.length());
-    while (suffix_len < min_len && left[left.length() - 1 - suffix_len] == right[right.length() - 1 - suffix_len]) {
-        suffix_len++;
-    }
-    return suffix_len;
-}
-
-diff_split calculate_diff_split(const std::string & left, const std::string & right) {
-    diff_split result;
-
-    auto left_seg = segmentize_markers(left);
-    auto right_seg = segmentize_markers(right);
-
-    if (left_seg.empty()) {
-        result.right = right;
-        return result;
-    }
-    if (right_seg.empty()) {
-        result.left = left;
-        return result;
-    }
-
-    auto left_start = left_seg.begin();
-    auto left_end = --left_seg.end();
-    auto right_start = right_seg.begin();
-    auto right_end = --right_seg.end();
-
-    auto test = [&] () {
-        return left_start != left_end && right_start != right_end;
-    };
-
-    bool left_fully_consumed = false;
-    bool right_fully_consumed = false;
-
-    while (test()) {
-        bool advanced = false;
-        if (*left_start == *right_start) {
-            result.prefix.append(left_start->value);
-            left_start++;
-            right_start++;
-            advanced = true;
-        }
-        if (*left_end == *right_end) {
-            result.suffix = left_end->value + result.suffix;
-            if (left_start != left_end) {
-                left_end--;
-            } else {
-                left_fully_consumed = true;
-            }
-            if (right_start != right_end) {
-                right_end--;
-            } else {
-                right_fully_consumed = true;
-            }
-            advanced = true;
-        }
-        if (!advanced) {
-            break;
-        }
-    }
-
-    if (left_start == left_end && right_start != right_end) {
-        if (*left_start == *right_end) {
-            result.suffix = right_end->value + result.suffix;
-            right_end--;
-            left_fully_consumed = true;
-        } else if (*left_start == *right_start) {
-            result.prefix.append(right_start->value);
-            right_start++;
-            left_fully_consumed = true;
-        }
-    } else if (right_start == right_end && left_start != left_end) {
-        if (*left_end == *right_start) {
-            result.suffix = left_end->value + result.suffix;
-            left_end--;
-            right_fully_consumed = true;
-        } else if (*left_start == *right_start) {
-            result.prefix.append(left_start->value);
-            left_start++;
-            right_fully_consumed = true;
-        }
-    } else if (left_start == left_end && right_start == right_end && *left_start == *right_start && left_start->type == segment_type::MARKER) {
-        result.prefix.append(right_start->value);
-        left_fully_consumed = true;
-        right_fully_consumed = true;
-    }
-
-    auto eat_segment = [](std::string str, const segment & seg) -> std::string { return std::move(str) + seg.value; };
-
-    bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
-    bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;
-
-    std::string remainder_left = std::accumulate(left_start, left_fully_consumed ? left_end : ++left_end, std::string(), eat_segment);
-    std::string remainder_right = std::accumulate(right_start, right_fully_consumed ? right_end : ++right_end, std::string(), eat_segment);
-
-    size_t suffix_len = can_have_text_suffix ? common_suffix_len(remainder_left, remainder_right) : 0;
-    // avoid overlaps between prefix and suffix
-    size_t prefix_len = can_have_text_prefix ? common_prefix_len(remainder_left.substr(0, remainder_left.size() - suffix_len),
-        remainder_right.substr(0, remainder_right.size() - suffix_len)) : 0;
-
-    result.prefix.append(remainder_left.substr(0, prefix_len));
-    result.suffix = remainder_left.substr(remainder_left.length() - suffix_len, suffix_len) + result.suffix;
-    result.left = remainder_left.substr(prefix_len, remainder_left.length() - prefix_len - suffix_len);
-    result.right = remainder_right.substr(prefix_len, remainder_right.length() - prefix_len - suffix_len);
-
-    if (result.left == "" && result.right == "") {
-        // degenerate case, no diff
-        result.prefix = left;
-        result.suffix = "";
-        // pick prefix = all as representation
-    }
-    return result;
-}
-
-// Returns the prefix of `full` up until the first occurrence of the common prefix of `left` and `right`
-std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right) {
-    // Find the common prefix of left and right
-    size_t common_prefix_len = 0;
-    size_t min_len           = std::min(left.length(), right.length());
-    while (common_prefix_len < min_len && left[common_prefix_len] == right[common_prefix_len]) {
-        common_prefix_len++;
-    }
-
-    // If there's no common prefix, return empty string
-    if (common_prefix_len == 0) {
-        return "";
-    }
-
-    // Find the common prefix in the full string
-    std::string common_prefix = left.substr(0, common_prefix_len);
-    size_t      pos           = full.find(common_prefix);
-
-    // If not found, return empty string
-    if (pos == std::string::npos) {
-        return "";
-    }
-
-    // Return everything before the common prefix
-    return full.substr(0, pos);
-}
-
-// Returns the suffix of `full` after the last occurrence of the common suffix of `left` and `right`
-std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right) {
-    // Find the common suffix of left and right (compare from the end)
-    size_t common_suffix_len = 0;
-    size_t min_len           = std::min(left.length(), right.length());
-    while (common_suffix_len < min_len &&
-           left[left.length() - 1 - common_suffix_len] == right[right.length() - 1 - common_suffix_len]) {
-        common_suffix_len++;
-    }
-
-    // If there's no common suffix, return empty string
-    if (common_suffix_len == 0) {
-        return "";
-    }
-
-    // Extract the common suffix
-    std::string common_suffix = left.substr(left.length() - common_suffix_len);
-
-    // Find the last occurrence of the common suffix in the full string
-    size_t pos = full.rfind(common_suffix);
-
-    // If not found, return empty string
-    if (pos == std::string::npos) {
-        return "";
-    }
-
-    // Return everything after the common suffix
-    return full.substr(pos + common_suffix_len);
-}
-
-// TODO: segmentize will treat a JSON array inside tags as a tag: <calls>[{ "fun": { ... } }]</calls> will be three markers
-// not too worried about that because it hasn't turned out as a problem anywhere, but noting here in case it will
-// Might have to put some restrictions on tag contents as well (like "no { }")
-std::vector<segment> segmentize_markers(const std::string & text) {
-    std::vector<segment> retval;
-    bool in_marker = false;
-    char marker_opener = '\0';
-
-    auto is_marker_opener = [](char c) -> bool { return c == '<' || c == '['; };
-    auto is_marker_closer = [](char op, char c) -> bool { return (op == '<' && c == '>') || (op == '[' && c == ']'); };
-
-    size_t last_border = 0;
-
-    for (size_t cur_pos = 0; cur_pos < text.length(); cur_pos++) {
-        if (!in_marker && is_marker_opener(text[cur_pos])) {
-            if (last_border < cur_pos) {
-                retval.push_back(segment(segment_type::TEXT, text.substr(last_border, cur_pos - last_border)));
-            }
-            last_border = cur_pos;
-            in_marker = true;
-            marker_opener = text[cur_pos];
-        } else if (in_marker && is_marker_closer(marker_opener, text[cur_pos])) {
-            // no need to check because last_border will always be smaller
-                retval.push_back(segment(segment_type::MARKER, text.substr(last_border, cur_pos - last_border + 1)));
-            last_border = cur_pos + 1;
-            in_marker = false;
-            marker_opener = '\0';
-        }
-    }
-    if (last_border < text.length()) {
-            retval.push_back(segment(segment_type::TEXT, text.substr(last_border)));
-    }
-    return retval;
-}
-
-std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments) {
-    std::vector<segment> result;
-    for (const auto & seg : segments) {
-        if (!trim_whitespace(seg.value).empty()) {
-            result.push_back(seg);
-        }
-    }
-    return result;
-}
-
-namespace autoparser {
-
-std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
-    templates_params tmpl_params;
-    tmpl_params.messages              = params.messages;
-    tmpl_params.tools                 = params.tools;
-    tmpl_params.add_generation_prompt = params.add_generation_prompt;
-    tmpl_params.enable_thinking       = params.enable_thinking;
-
-    if (params.extra_context) {
-        tmpl_params.extra_context = *params.extra_context;
-    }
-    tmpl_params.extra_context["enable_thinking"] = params.enable_thinking;
-
-    try {
-        return common_chat_template_direct_apply(tmpl, tmpl_params);
-    } catch (const std::exception & e) {
-        LOG_DBG("Template application failed: %s\n", e.what());
-        return "";
-    }
-}
-
-std::optional<compare_variants_result> compare_variants(
-    const common_chat_template &                   tmpl,
-    const template_params &                        params_A,
-    const std::function<void(template_params &)> & params_modifier) {
-    // Create variant B by copying A
-    template_params params_B = params_A;
-
-    // Apply modifier to create variant B
-    if (params_modifier) {
-        params_modifier(params_B);
-    }
-
-    // Apply template to both variants
-    std::string output_A = apply_template(tmpl, params_A);
-    std::string output_B = apply_template(tmpl, params_B);
-
-    // Check for template application failures
-    if (output_A.empty() || output_B.empty()) {
-        return std::nullopt;
-    }
-
-    // Calculate diff and return result with both outputs
-    compare_variants_result result;
-    result.diff     = calculate_diff_split(output_A, output_B);
-    result.output_A = output_A;
-    result.output_B = output_B;
-
-    return result;
-}
-
-}  // namespace autoparser
-
--- a/common/chat-auto-parser-helpers.h
+++ b/common/chat-auto-parser-helpers.h
@ -1,73 +0,0 @@
-#pragma once
-
-#include "chat-auto-parser.h"
-#include <functional>
-#include <optional>
-#include <string>
-
-std::string trim_whitespace(const std::string & str);
-std::string trim_leading_whitespace(const std::string & str);
-std::string trim_trailing_whitespace(const std::string & str);
-std::string trim_trailing_newlines(const std::string & str);
-
-// calculate a diff split (longest common prefix, longest common suffix excluding prefix,
-// mismatched part on the left, mismatched part on the right) between two strings
-// account for markers - align prefix and suffix endings so that they end on markers
-// * eg.:
-// calculate_diff_split("<html><body><div></div></body></html>", "<html><body><p>Something</p></body><html>") ->
-//  { "prefix": "<html><body>" (not: "<html><body><"), "suffix": "</body></html>", "left": "<div></div>", "right": "<p>Something</p>" }
-// calculate_diff_split("<html><body>Something</body></html>", "<html><body></body><html>") ->
-//  { "prefix": "<html><body>", "suffix": "</body></html>", "left": "Something", "right": "" }
-diff_split calculate_diff_split(const std::string & left, const std::string & right);
-
-// Returns the prefix of `full` up until the first occurrence of the common prefix of `left` and `right`
-// Returns empty string if there's no common prefix
-// * eg.:
-// until_common_prefix("really want a FUNCTION call", "FUNCTION alpha", "FUNCTION beta") -> "really want a "
-// until_common_prefix("<tool_call>", "<something>", "<something_else>") -> ""
-// until_common_prefix("some text", "1234", "abcd") -> ""
-// until_common_prefix("one arg two args three args four", "argument alpha", "argument beta") -> "one ""
-std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right);
-
-// Returns the suffix of `full` after the last occurrence of the common suffix of `left` and `right`
-// Returns empty string if there's no common suffix
-// Mirror function of `until_common_prefix`
-// * eg.:
-// after_common_suffix("really want a FUNCTION call", "first FUNCTION", "second FUNCTION") -> " call"
-// after_common_suffix("one arg two-args three args four", "alpha-args", "beta-args") -> " three args four"
-std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right);
-
-// Segmentize text into markers and non-marker fragments
-// * eg.:
-// segmentize_markers("<html><head><title>The site title</title><body><div>Here's some <b>content</b></div></body></html>" ->
-//  [ (MARKER, "<html>"), (MARKER, "<head>"), (MARKER, "<title>"), (TEXT, "The site title"), (MARKER, "</title>"),
-//    (MARKER, "<body>"), (MARKER, "<div>"), (TEXT, "Here's some "), (MARKER, "<b>"), (TEXT, "content"), (MARKER, "</b>"),
-//    (MARKER, "</div>"), (MARKER, "</body>"), (MARKER, "</html>")
-//  ]
-// segmentize_markers("<|tool_call|>[args]{ are here }[/args]<|tool_call_end|>") ->
-//  [ (MARKER, "<|tool_call|>"), (MARKER, "[args]"), (TEXT, "{ are here }"), (MARKER, "[/args]"), (MARKER, "<|tool_call_end|>") ]
-std::vector<segment> segmentize_markers(const std::string & text);
-
-// Prune whitespace-only segments from a vector of segments
-// * eg.:
-// segmentize_markers("<tool_call>\n<function=foo>\n<arg=bar>\n   \n</arg>\n</function>\n</tool_call>") ->
-//  X = [ (MARKER, "<tool_call>"), (TEXT, "\n"), (MARKER, "<function=foo>"), (TEXT, "\n"), (MARKER, "<arg=bar>"), (TEXT, "\n   \n"),
-//        (MARKER, "</arg>"), (TEXT, "\n"), (MARKER, "</function>"), (TEXT, "\n"), (MARKER, "</tool_call>") ]
-// prune_whitespace_segments(X) -> [ (MARKER, "<tool_call>"), (MARKER, "<function=foo>"), (MARKER, "<arg=bar>"), (MARKER, "</arg>"),
-//                                   (MARKER, "</function>"), (MARKER, "</tool_call>") ]
-std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments);
-
-namespace autoparser {
-
-// Apply a template with the given parameters, returning the rendered string (empty on failure)
-std::string apply_template(const common_chat_template & tmpl, const template_params & params);
-
-// Factorized differential comparison function
-// Takes base params and a single modifier lambda to create variant B
-// Returns compare_variants_result containing diff and both outputs, or std::nullopt on failure
-std::optional<compare_variants_result> compare_variants(
-    const common_chat_template &                   tmpl,
-    const template_params &                        params_A,
-    const std::function<void(template_params &)> & params_modifier);
-
-}  // namespace autoparser
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@ -1,433 +0,0 @@
-#pragma once
-
-#include "chat.h"
-#include "common.h"
-#include "jinja/caps.h"
-#include "peg-parser.h"
-
-#include <chrono>
-#include <optional>
-#include <string>
-#include <utility>
-#include <vector>
-
-using json = nlohmann::ordered_json;
-
-class common_chat_peg_builder;
-
-// ============================================================================
-// Parameters for template application (low-level, used by diff analysis)
-// ============================================================================
-struct template_params {
-    json                messages;
-    json                tools;
-    bool                add_generation_prompt = false;
-    bool                enable_thinking       = true;
-    std::optional<json> extra_context         = std::nullopt;
-};
-
-struct diff_split {
-    std::string prefix;
-    std::string suffix;
-    std::string left;
-    std::string right;
-
-    bool operator==(struct diff_split & other) const {
-        return prefix == other.prefix && suffix == other.suffix && left == other.left && right == other.right;
-    }
-};
-
-// Result of compare_variants containing diff and original outputs
-struct compare_variants_result {
-    diff_split  diff;
-    std::string output_A;
-    std::string output_B;
-};
-
-namespace autoparser {
-
-// ============================================================================
-// High-level params for parser generation
-// ============================================================================
-
-struct templates_params {
-    json                                  messages;
-    json                                  tools;
-    common_chat_tool_choice               tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
-    json                                  json_schema;
-    bool                                  parallel_tool_calls = true;
-    common_reasoning_format               reasoning_format    = COMMON_REASONING_FORMAT_AUTO;
-    bool                                  stream              = true;
-    std::string                           grammar;
-    bool                                  add_generation_prompt = false;
-    bool                                  enable_thinking       = true;
-    std::chrono::system_clock::time_point now                   = std::chrono::system_clock::now();
-    json                                  extra_context;
-    bool                                  add_bos       = false;
-    bool                                  add_eos       = false;
-    bool                                  is_inference  = true;
-    bool                                  add_inference = false;
-    bool                                  mark_input    = true;  // whether to mark input strings in the jinja context
-};
-
-// ============================================================================
-// Analysis Result Enums
-// ============================================================================
-
-// Reasoning handling mode (derived from R1-R3 comparisons)
-enum class reasoning_mode {
-    NONE,           // No reasoning markers detected
-    TAG_BASED,      // Standard tag-based: <think>...</think>
-    DELIMITER,      // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
-    FORCED_OPEN,    // Template ends with open reasoning tag (empty start, non-empty end)
-    FORCED_CLOSED,  // Template ends with open reasoning tag on enabled thinking but
-                    // with both opened and closed tag for disabled thinking
-    TOOLS_ONLY      // Only reason on tool calls, not on normal content
-};
-
-inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode) {
-    switch (mode) {
-        case reasoning_mode::NONE:
-            return os << "NONE";
-        case reasoning_mode::TAG_BASED:
-            return os << "TAG_BASED";
-        case reasoning_mode::DELIMITER:
-            return os << "DELIMITER";
-        case reasoning_mode::FORCED_OPEN:
-            return os << "FORCED_OPEN";
-        case reasoning_mode::FORCED_CLOSED:
-            return os << "FORCED_CLOSED";
-        case reasoning_mode::TOOLS_ONLY:
-            return os << "TOOLS_ONLY";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// Content wrapping mode (derived from C1 comparison)
-enum class content_mode {
-    PLAIN,                   // No content markers
-    ALWAYS_WRAPPED,          // Content always wrapped with markers
-    WRAPPED_WITH_REASONING,  // Content wrapped only when reasoning present
-};
-
-inline std::ostream & operator<<(std::ostream & os, const content_mode & mode) {
-    switch (mode) {
-        case content_mode::PLAIN:
-            return os << "PLAIN";
-        case content_mode::ALWAYS_WRAPPED:
-            return os << "ALWAYS_WRAPPED";
-        case content_mode::WRAPPED_WITH_REASONING:
-            return os << "WRAPPED_WITH_REASONING";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// Call ID position in tool calls (for non-JSON formats)
-enum class call_id_position {
-    NONE,                   // No call ID support detected
-    PRE_FUNC_NAME,          // Call ID before function name: [CALL_ID]id[FUNC]name{args}
-    BETWEEN_FUNC_AND_ARGS,  // Call ID between function and args: [FUNC]name[CALL_ID]id{args}
-    POST_ARGS,              // Call ID after arguments: [FUNC]name{args}[CALL_ID]id
-};
-
-inline std::ostream & operator<<(std::ostream & os, const call_id_position & pos) {
-    switch (pos) {
-        case call_id_position::NONE:
-            return os << "NONE";
-        case call_id_position::PRE_FUNC_NAME:
-            return os << "PRE_FUNC_NAME";
-        case call_id_position::BETWEEN_FUNC_AND_ARGS:
-            return os << "BETWEEN_FUNC_AND_ARGS";
-        case call_id_position::POST_ARGS:
-            return os << "POST_ARGS";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// Tool call format classification (derived from T1-T5, A1-A3 comparisons)
-enum class tool_format {
-    NONE,             // No tool support detected
-    JSON_NATIVE,      // Pure JSON: {"name": "X", "arguments": {...}}
-    TAG_WITH_JSON,    // Tag-based with JSON args: <function=X>{...}</function>
-    TAG_WITH_TAGGED,  // Tag-based with tagged args: <param=key>value</param>
-};
-
-inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
-    switch (format) {
-        case tool_format::NONE:
-            return os << "NONE";
-        case tool_format::JSON_NATIVE:
-            return os << "JSON_NATIVE";
-        case tool_format::TAG_WITH_JSON:
-            return os << "TAG_WITH_JSON";
-        case tool_format::TAG_WITH_TAGGED:
-            return os << "TAG_WITH_TAGGED";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// ============================================================================
-// Sub-structs for tool analysis
-// ============================================================================
-
-struct tool_format_analysis {
-    tool_format mode = tool_format::NONE;
-
-    std::string section_start;   // e.g., "<tool_call>", "[TOOL_CALLS]", ""
-    std::string section_end;     // e.g., "</tool_call>", ""
-    std::string per_call_start;  // e.g., "<|tool_call_begin|>", "" (for multi-call templates)
-    std::string per_call_end;    // e.g., "<|tool_call_end|>", ""
-
-    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
-    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
-    bool uses_python_dicts = false;     // Tool call args use Python dict format (single-quoted strings)
-
-    std::string              function_field = "function";
-    std::string              name_field     = "name";
-    std::string              args_field     = "arguments";
-    std::string              id_field;
-    std::string              gen_id_field;
-    std::vector<std::string> parameter_order;
-};
-
-struct tool_function_analysis {
-    std::string name_prefix;  // e.g., "<function=", "\"name\": \"", "functions."
-    std::string name_suffix;  // e.g., ">", "\"", ":0"
-    std::string close;        // e.g., "</function>", "" (for tag-based)
-};
-
-struct tool_arguments_analysis {
-    std::string start;          // e.g., "<|tool_call_argument_begin|>", "<args>"
-    std::string end;            // e.g., "<|tool_call_argument_end|>", "</args>"
-    std::string name_prefix;   // e.g., "<param=", "<arg_key>", "\""
-    std::string name_suffix;   // e.g., ">", "</arg_key>", "\":"
-    std::string value_prefix;  // e.g., "", "<arg_value>", ""
-    std::string value_suffix;  // e.g., "</param>", "</arg_value>", ""
-    std::string separator;     // e.g., "", "\n", ","
-};
-
-struct tool_id_analysis {
-    call_id_position pos = call_id_position::NONE;
-
-    std::string prefix;  // e.g., "[CALL_ID]" (marker before call ID value)
-    std::string suffix;  // e.g., "" (marker after call ID value, before next section)
-};
-
-// ============================================================================
-// Parser build context (shared interface for build_parser methods)
-// ============================================================================
-
-struct analyze_content;
-
-struct parser_build_context {
-    common_chat_peg_builder & p;
-    const templates_params &          inputs;
-    common_peg_parser                 reasoning_parser;
-    bool                              extracting_reasoning = false;
-    const analyze_content *           content              = nullptr;
-
-    parser_build_context(common_chat_peg_builder & p, const templates_params & inputs);
-};
-
-// ============================================================================
-// Base class for analyzers with parser building
-// ============================================================================
-
-struct analyze_base {
-    virtual ~analyze_base() = default;
-    virtual common_peg_parser build_parser(parser_build_context & ctx) const = 0;
-
-  protected:
-    const common_chat_template * tmpl = nullptr;
-
-    analyze_base() = default;
-    explicit analyze_base(const common_chat_template & tmpl) : tmpl(&tmpl) {}
-};
-
-// ============================================================================
-// Reasoning analyzer
-// ============================================================================
-
-struct analyze_reasoning : analyze_base {
-    reasoning_mode mode = reasoning_mode::NONE;
-
-    std::string start;  // e.g., "<think>", "[THINK]", "<|START_THINKING|>", ""
-    std::string end;    // e.g., "</think>", "[BEGIN FINAL RESPONSE]", "<|END_THINKING|>"
-
-    analyze_reasoning() = default;
-    analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
-
-    common_peg_parser build_parser(parser_build_context & ctx) const override;
-
-  private:
-    // Look for reasoning markers in rendered content
-    void compare_reasoning_presence();
-
-    // Compare generation prompt with enable_thinking=true vs false
-    void compare_thinking_enabled();
-
-    // Check if reasoning is always possible or only in tool calls
-    void compare_reasoning_scope();
-};
-
-// ============================================================================
-// Content analyzer
-// ============================================================================
-
-struct analyze_content : analyze_base {
-    content_mode mode = content_mode::PLAIN;
-
-    std::string start;  // e.g., "<response>", ">>>all\n", ""
-    std::string end;    // e.g., "</response>", ""
-
-    bool requires_nonnull_content = false;
-
-    analyze_content() = default;
-    analyze_content(const common_chat_template & tmpl, const analyze_reasoning & reasoning);
-
-    common_peg_parser build_parser(parser_build_context & ctx) const override;
-
-    bool is_always_wrapped() const;
-    common_peg_parser build_optional_wrapped(parser_build_context & ctx) const;
-};
-
-// ============================================================================
-// Tool analyzer
-// ============================================================================
-
-struct analyze_tools : analyze_base {
-    tool_format_analysis    format;
-    tool_function_analysis  function;
-    tool_arguments_analysis arguments;
-    tool_id_analysis        call_id;
-
-    analyze_tools() = default;
-    analyze_tools(const common_chat_template & tmpl,
-                  const jinja::caps &          caps,
-                  const analyze_reasoning &    reasoning);
-
-    common_peg_parser build_parser(parser_build_context & ctx) const override;
-
-  private:
-    // Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
-    void analyze_tool_calls(const analyze_reasoning & reasoning);
-
-    // Analyze format based on position of function and argument name in needle
-    void analyze_tool_call_format(const std::string &       haystack,
-                                  const std::string &       fun_name_needle,
-                                  const std::string &       arg_name_needle,
-                                  const analyze_reasoning & reasoning);
-
-    // Analyze specifics of JSON native format (entire tool call is a JSON object)
-    void analyze_tool_call_format_json_native(const std::string & clean_haystack,
-                                              const std::string & fun_name_needle,
-                                              const std::string & arg_name_needle);
-
-    // Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
-    void analyze_tool_call_format_non_json(const std::string & clean_haystack,
-                                           const std::string & fun_name_needle);
-
-    // Check for and extract specific per-call markers for non-native-JSON templates with parallel call support
-    void check_per_call_markers();
-
-    // Extract function name markers
-    void extract_function_markers();
-
-    // Delegates to separate functions for: separator analysis, argument name analysis, argument value analysis
-    void analyze_arguments();
-
-    // Extract argument name markers
-    void extract_argument_name_markers();
-
-    // Extract argument value markers
-    void extract_argument_value_markers();
-
-    // Extract argument separator, if specified (eg. <arg=foo>...</arg><sep><arg=bar>...</arg>)
-    void extract_argument_separator();
-
-    // Extract argument wrapper markers, if present (eg. '<args><arg=foo>...</arg><arg=bar>...</arg></args>')
-    void extract_args_markers();
-
-    // Extract call ID markers, if present
-    void extract_call_id_markers();
-
-    // Per-format tool parser builders
-    common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
-    common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
-    common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
-};
-
-// ============================================================================
-// Main autoparser class
-// ============================================================================
-
-struct autoparser {
-    jinja::caps          jinja_caps;
-    analyze_reasoning    reasoning;
-    analyze_content      content;
-    analyze_tools        tools;
-    bool                 analysis_complete = false;
-
-    // Preserved tokens for tokenizer (union of all non-empty markers)
-    std::vector<std::string> preserved_tokens;
-
-    autoparser() = default;
-
-    // Run full differential analysis on a template
-    void analyze_template(const common_chat_template & tmpl);
-
-    // Build the PEG parser for this template
-    common_peg_arena build_parser(const templates_params & inputs) const;
-
-  private:
-    // Collect tokens from entire analysis to preserve
-    void collect_preserved_tokens();
-};
-
-// ============================================================================
-// Parser generator
-// ============================================================================
-
-class peg_generator {
-  public:
-    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct templates_params & inputs);
-
-    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct templates_params & inputs,
-                                              const autoparser &              autoparser);
-};
-
-}  // namespace autoparser
-
-enum segment_type { TEXT, MARKER };
-
-inline std::ostream & operator<<(std::ostream & os, const segment_type & type) {
-    switch (type) {
-        case segment_type::TEXT:
-            return os << "TEXT";
-        case segment_type::MARKER:
-            return os << "MARKER";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-struct segment {
-    segment_type type;
-    std::string  value;
-
-    segment(segment_type type, std::string value) : type(type), value(std::move(value)) {}
-
-    bool operator==(const segment & other) const {
-        return type == other.type && value == other.value;
-    }
-
-    bool operator!=(const segment & other) const {
-        return !(*this == other);
-    }
-};
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@ -0,0 +1,879 @@
+#include "chat.h"
+#include "chat-parser.h"
+#include "common.h"
+#include "json-partial.h"
+#include "json-schema-to-grammar.h"
+#include "log.h"
+#include "regex-partial.h"
+
+using json = nlohmann::ordered_json;
+
+// Thrown when an XML-style tool call is malformed; carries a human-readable message.
+// The constructor is explicit so a std::string is never silently converted into an exception object.
+class xml_toolcall_syntax_exception : public std::runtime_error {
+  public:
+    explicit xml_toolcall_syntax_exception(const std::string & message) : std::runtime_error(message) {}
+};
+
+// Sorts `vec` ascending and drops duplicate elements, in place.
+template<typename T>
+inline void sort_uniq(std::vector<T> &vec) {
+    std::sort(vec.begin(), vec.end());
+    // std::unique compacts the distinct values to the front and returns the start of the leftover tail
+    const auto tail_begin = std::unique(vec.begin(), vec.end());
+    vec.erase(tail_begin, vec.end());
+}
+
+// Returns true when every element of `str` is whitespace according to std::isspace.
+// An empty range yields true.
+template<typename T>
+inline bool all_space(const T &str) {
+    for (auto it = str.begin(); it != str.end(); ++it) {
+        // go through unsigned char so negative char values are never handed to std::isspace
+        const unsigned char ch = *it;
+        if (!std::isspace(ch)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Returns the length of the longest prefix of `s` that does not end inside a multi-byte UTF-8 sequence.
+// Only the last four bytes are inspected, walking backwards from the end:
+//   - an ASCII byte means the tail is fine as is;
+//   - a lead byte decides the answer: keep everything if the sequence it announces is complete,
+//     otherwise cut right before the lead byte (an invalid lead byte is cut as well);
+//   - continuation bytes are skipped.
+// If no ASCII or lead byte is found, up to three trailing bytes are dropped.
+static size_t utf8_truncate_safe(const std::string_view s) {
+    const size_t total = s.size();
+    if (total == 0) {
+        return 0;
+    }
+    size_t pos = total;
+    for (size_t steps = 0; steps < 4 && pos > 0; ++steps) {
+        --pos;
+        const unsigned char byte = s[pos];
+        if ((byte & 0x80) == 0) {
+            // plain ASCII
+            return total;
+        }
+        if ((byte & 0xC0) != 0xC0) {
+            // continuation byte, keep looking for its lead byte
+            continue;
+        }
+        size_t needed = 0;
+        if ((byte & 0xE0) == 0xC0) {
+            needed = 2;
+        } else if ((byte & 0xF0) == 0xE0) {
+            needed = 3;
+        } else if ((byte & 0xF8) == 0xF0) {
+            needed = 4;
+        } else {
+            // not a valid lead byte
+            return pos;
+        }
+        return (total - pos >= needed) ? total : pos;
+    }
+    return total - std::min(total, size_t(3));
+}
+
+// Shrinks `s` in place to the length reported by utf8_truncate_safe(s).
+inline void utf8_truncate_safe_resize(std::string &s) {
+    const size_t safe_len = utf8_truncate_safe(s);
+    s.resize(safe_len);
+}
+
+// Returns the prefix of `s` whose length is reported by utf8_truncate_safe(s); no data is copied.
+inline std::string_view utf8_truncate_safe_view(const std::string_view s) {
+    const size_t safe_len = utf8_truncate_safe(s);
+    return s.substr(0, safe_len);
+}
+
+// Searches for `literal1` followed, after optional spaces, by `literal2` (or by as much of `literal2`
+// as the remaining input can hold, so a truncated tail still counts as a match).
+// With an empty `literal1` this is just a search for `literal2`.
+// On success the returned result has its prelude measured from the position at entry and groups[0]
+// extended to cover both literals; the builder is left right after the matched part of `literal2`.
+// On failure the builder is moved back to the position at entry and std::nullopt is returned.
+static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) {
+    if (literal1.empty()) {
+        return builder.try_find_literal(literal2);
+    }
+    const auto origin = builder.pos();
+    while (auto hit = builder.try_find_literal(literal1)) {
+        builder.consume_spaces();
+        const auto remaining = builder.input().size() - builder.pos();
+        const auto cmp_len = std::min(literal2.size(), remaining);
+        const bool second_matches = builder.input().compare(builder.pos(), cmp_len, literal2, 0, cmp_len) == 0;
+        if (!second_matches) {
+            // retry one character past the start of this literal1 occurrence
+            builder.move_to(hit->groups[0].begin + 1);
+            continue;
+        }
+        // after a retry the prelude no longer starts at `origin`, so rebuild it
+        const auto prelude_len = hit->groups[0].begin - origin;
+        if (hit->prelude.size() != prelude_len) {
+            hit->prelude = builder.str({origin, hit->groups[0].begin});
+        }
+        builder.move_to(builder.pos() + cmp_len);
+        hit->groups[0].end = builder.pos();
+        GGML_ASSERT(hit->groups[0].begin != hit->groups[0].end);
+        return hit;
+    }
+    builder.move_to(origin);
+    return std::nullopt;
+}
+
+/**
+ * Make a GBNF expression that accepts any string except those containing any of the forbidden strings.
+ *
+ * The forbidden strings are sorted and walked as a trie: at every node the expression allows either
+ * a character that starts none of the forbidden continuations, or one of the continuation characters
+ * followed by the expression for the matching child node. A node at which a forbidden string ends
+ * produces no expression, so that path can never be completed.
+ *
+ * @param forbids  strings that must not appear in the generated text (taken by value, sorted locally)
+ * @return         a GBNF expression of the form "<expr>*"; "( . )*" when `forbids` is empty
+ */
+std::string make_gbnf_excluding(std::vector<std::string> forbids) {
+    // Escape a byte for use inside a GBNF character class ([...])
+    constexpr auto charclass_escape = [](unsigned char c) -> std::string {
+        if (c == '\\' || c == ']' || c == '^' || c == '-') {
+            std::string s = "\\";
+            s.push_back((char)c);
+            return s;
+        }
+        if (isprint(c)) {
+            return std::string(1, (char)c);
+        }
+        char buf[16];
+        snprintf(buf, 15, "\\x%02X", c);
+        return std::string(buf);
+    };
+    // Recursive trie walk over forbids[l, r), all of which share their first `depth` characters.
+    // `self` is the lambda itself (generic lambdas cannot name themselves).
+    constexpr auto build_expr = [charclass_escape](auto self, const std::vector<std::string>& forbids, int l, int r, int depth) -> std::string {
+        std::vector<std::pair<unsigned char, std::pair<int,int>>> children;
+        int i = l;
+        while (i < r) {
+            const std::string &s = forbids[i];
+            if ((int)s.size() == depth) {
+                if (depth > 0) {
+                    // A forbidden string ends exactly here, so the text matched so far already
+                    // contains it. The whole node is a dead end, even if longer forbidden strings
+                    // share this prefix (e.g. "ab" and "abc").
+                    return "";
+                }
+                // an empty forbidden string at the root carries no information, skip it
+                ++i;
+                continue;
+            }
+            unsigned char c = (unsigned char)s[depth];
+            // the input is sorted, so all strings continuing with `c` are adjacent
+            int j = i;
+            while (j < r && (int)forbids[j].size() > depth &&
+                   (unsigned char)forbids[j][depth] == c) {
+                ++j;
+            }
+            children.push_back({c, {i, j}});
+            i = j;
+        }
+        std::vector<std::string> alts;
+        if (!children.empty()) {
+            // any character that does not continue a forbidden string
+            std::string cls;
+            for (auto &ch : children) cls += charclass_escape(ch.first);
+            alts.push_back(std::string("[^") + cls + "]");
+        }
+        for (auto &ch : children) {
+            std::string childExpr = self(self, forbids, ch.second.first, ch.second.second, depth+1);
+            if (!childExpr.empty()) {
+                // continuation character as a quoted GBNF literal, followed by the child expression
+                std::string quoted_ch = "\"";
+                if (ch.first == '\\') quoted_ch += "\\\\";
+                else if (ch.first == '"') quoted_ch += "\\\"";
+                else if (isprint(ch.first)) quoted_ch.push_back(ch.first);
+                else {
+                    char buf[16];
+                    snprintf(buf, 15, "\\x%02X", ch.first);
+                    quoted_ch += buf;
+                }
+                quoted_ch += "\"";
+                std::string branch = quoted_ch + std::string(" ") + childExpr;
+                alts.push_back(branch);
+            }
+        }
+        if (alts.empty()) return "";
+        std::ostringstream oss;
+        oss << "( ";
+        for (size_t k = 0; k < alts.size(); ++k) {
+            if (k) oss << " | ";
+            oss << alts[k];
+        }
+        oss << " )";
+        return oss.str();
+    };
+    if (forbids.empty()) return "( . )*";
+    std::sort(forbids.begin(), forbids.end());
+    std::string expr = build_expr(build_expr, forbids, 0, forbids.size(), 0);
+    if (expr.empty()) {
+        // fallback: only exclude the first characters of the forbidden strings
+        std::string cls;
+        for (auto &s : forbids) if (!s.empty()) cls += charclass_escape((unsigned char)s[0]);
+        expr = std::string("( [^") + cls + "] )";
+    }
+    if (forbids.size() == 1)
+        return expr + "*";
+    else
+        return std::string("( ") + expr + " )*";
+}
+
+/**
+ * Build grammar for xml-style tool call
+ * form.scope_start and form.scope_end can be empty.
+ * Requires data.format for model-specific hacks.
+ *
+ * When `tools` is a non-empty array, data.grammar is replaced with a grammar that accepts zero or more
+ * calls to the listed tools, and a word trigger (form.scope_start + form.tool_start) is appended to
+ * data.grammar_triggers. Otherwise `data` is left untouched.
+ * Tools without a usable "function" object (string name, object parameters, object properties) are
+ * skipped with a warning.
+ *
+ * @param data   chat params to fill in; data.format is read for the Kimi-K2 function name scheme
+ * @param tools  tool definitions as JSON (array of {"type": "function", "function": {...}})
+ * @param form   literals that delimit scopes, tools, keys and values in the XML-style syntax
+ */
+void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
+    GGML_ASSERT(!form.tool_start.empty());
+    GGML_ASSERT(!form.tool_sep.empty());
+    GGML_ASSERT(!form.key_start.empty());
+    GGML_ASSERT(!form.val_end.empty());
+    GGML_ASSERT(!form.tool_end.empty());
+
+    // The key/value separator may consist of two parts joined by a newline
+    std::string key_val_sep = form.key_val_sep;
+    if (form.key_val_sep2) {
+        key_val_sep += "\n";
+        key_val_sep += *form.key_val_sep2;
+    }
+    GGML_ASSERT(!key_val_sep.empty());
+
+    if (tools.is_array() && !tools.empty()) {
+        data.grammar = build_grammar([&](const common_grammar_builder &builder) {
+            // Raw string value: anything that does not contain a value terminator
+            auto string_arg_val = form.last_val_end ?
+                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
+                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
+
+            std::vector<std::string> tool_rules;
+            for (const auto & tool : tools) {
+                if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
+                    LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
+                    continue;
+                }
+                const auto & function = tool.at("function");
+                if (!function.contains("name") || !function.at("name").is_string()) {
+                    LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
+                    continue;
+                }
+                if (!function.contains("parameters") || !function.at("parameters").is_object()) {
+                    LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
+                    continue;
+                }
+                std::string name = function.at("name");
+                // work on a copy: resolve_refs is handed the schema as a mutable object
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                struct parameter_rule {
+                    std::string symbol_name;
+                    bool is_required;
+                };
+                std::vector<parameter_rule> arg_rules;
+                if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
+                    LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
+                    continue;
+                } else {
+                    std::vector<std::string> requiredParameters;
+                    if (parameters.contains("required")) {
+                        // nlohmann::json exceptions derive from std::exception (not std::runtime_error),
+                        // and "required" lives on `parameters`, not on `function`.
+                        try { parameters.at("required").get_to(requiredParameters); }
+                        catch (const std::exception &) {
+                            LOG_WRN("Invalid function required parameters, ignoring: %s", parameters.at("required").dump(2).c_str());
+                            // drop anything converted before the failure so the list is really ignored
+                            requiredParameters.clear();
+                        }
+                    }
+                    // sorted + unique so std::binary_search can be used below
+                    sort_uniq(requiredParameters);
+                    for (const auto & [key, value] : parameters.at("properties").items()) {
+                        std::string quoted_key = key;
+                        bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
+                        // the key sits between double quotes: escape it like a literal, then strip the outer quotes
+                        if (form.key_start.back() == '"' && key_val_sep[0] == '"') {
+                            quoted_key = gbnf_format_literal(key);
+                            quoted_key = quoted_key.substr(1, quoted_key.size() - 2);
+                        }
+                        // Value rule: string-typed values use the raw string rule when form.raw_argval is
+                        // true, raw-or-schema when it is unset, and the schema rule only when it is false;
+                        // all other types always use the schema rule.
+                        arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
+                            gbnf_format_literal(form.key_start) + " " +
+                            gbnf_format_literal(quoted_key) + " " +
+                            gbnf_format_literal(key_val_sep) + " " +
+                            ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
+                                    (form.raw_argval ?
+                                            string_arg_val :
+                                            "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
+                                    ) :
+                                    builder.add_schema(name + "-arg-" + key, value)
+                            )
+                        ), required});
+                    }
+                }
+
+                // Chain the argument rules from the last one to the first. For every position two rules
+                // are kept: one that starts directly with the argument (`next_arg`) and one that starts
+                // with the terminator of the preceding value (`next_arg_with_sep`).
+                auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
+                decltype(next_arg_with_sep) next_arg = "\"\"";
+                // i is unsigned: decrementing past 0 wraps around to a huge value, which ends the loop
+                for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) {
+                    std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
+                    next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
+                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg
+                    );
+                    include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
+                    next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
+                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
+                    );
+                }
+
+                std::string quoted_name = name;
+                // same double-quote handling as for the keys above
+                if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
+                    quoted_name = gbnf_format_literal(name);
+                    quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
+                }
+                quoted_name = gbnf_format_literal(quoted_name);
+                // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
+                if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
+                    quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
+                }
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                        gbnf_format_literal(form.tool_start) + " " +
+                        quoted_name + " " +
+                        gbnf_format_literal(form.tool_sep) + " " +
+                        next_arg
+                ));
+            }
+
+            // root: [scope_start] ( call ( tool_end call )* call_end )? [scope_end]
+            auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
+            auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
+            auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
+            auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
+            builder.add_rule("root",
+                (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
+                tool_call_multiple_with_end  + "?" +
+                (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
+            );
+        });
+
+        // grammar trigger for tool call
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
+    }
+}
+
+/**
+ * Parse an XML-style tool call for the given xml_tool_call_format. Returns false on invalid syntax, leaving the parser position untouched.
+ * Throws xml_toolcall_syntax_exception if the syntax is invalid and the original state of the common_chat_msg_parser cannot be restored.
+ * form.scope_start, form.tool_sep and form.scope_end can be empty.
+ */
+inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
+    GGML_ASSERT(!form.tool_start.empty());
+    GGML_ASSERT(!form.key_start.empty());
+    GGML_ASSERT(!form.key_val_sep.empty());
+    GGML_ASSERT(!form.val_end.empty());
+    GGML_ASSERT(!form.tool_end.empty());
+
+    // Helper to choose return false or throw error
+    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
+        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
+        if (recovery) {
+            builder.move_to(start_pos);
+            return false;
+        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
+    };
+    // Drop substring from needle to end from a JSON
+    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
+        auto pos = json_str.rfind(needle);
+        if (pos == std::string::npos) {
+            return false;
+        }
+        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
+            unsigned char ch = static_cast<unsigned char>(json_str[i]);
+            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
+                return false;
+            }
+        }
+        if (pos != 0 && json_str[pos - 1] == '"') {
+            --pos;
+        }
+        json_str.resize(pos);
+        return true;
+    };
+    // Helper to generate a partial argument JSON
+    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
+        auto rest = builder.consume_rest();
+        utf8_truncate_safe_resize(rest);
+        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
+        auto tool_str = arguments.dump();
+        if (partial_json(tool_str)) {
+            if (builder.add_tool_call(function_name, "", tool_str)) {
+                return;
+            }
+        }
+        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
+    };
+    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end)
+    constexpr auto try_find_close = [](
+            common_chat_msg_parser & builder,
+            const std::string & end,
+            const std::optional<std::string> & alt_end,
+            const std::string & end_next,
+            const std::optional<std::string> & alt_end_next
+    ) {
+        auto saved_pos = builder.pos();
+        auto tc = builder.try_find_literal(end);
+        auto val_end_size = end.size();
+        if (alt_end) {
+            auto pos_1 = builder.pos();
+            builder.move_to(saved_pos);
+            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
+            if (alt_end_next) {
+                builder.move_to(saved_pos);
+                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
+                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
+                    tc2 = tc3;
+                }
+            }
+            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
+                tc = tc2;
+                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size());
+                builder.move_to(tc->groups[0].end);
+                val_end_size = alt_end->size();
+            } else {
+                builder.move_to(pos_1);
+            }
+        }
+        return std::make_pair(val_end_size, tc);
+    };
+    // Helper to find a val_end or last_val_end, returns matched pattern size
+    const auto try_find_val_end = [try_find_close, &builder, &form]() {
+        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
+    };
+    // Helper to find a tool_end or last_tool_end, returns matched pattern size
+    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
+        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
+    };
+
+    bool recovery = true;
+    const auto start_pos = builder.pos();
+    if (!all_space(form.scope_start)) {
+        if (auto tc = builder.try_find_literal(form.scope_start)) {
+            if (all_space(tc->prelude)) {
+                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
+                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
+            } else {
+                builder.move_to(start_pos);
+                return false;
+            }
+        } else return false;
+    }
+    while (auto tc = builder.try_find_literal(form.tool_start)) {
+        if (!all_space(tc->prelude)) {
+            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
+                    gbnf_format_literal(form.tool_start).c_str(),
+                    gbnf_format_literal(tc->prelude).c_str()
+            );
+            builder.move_to(tc->groups[0].begin - tc->prelude.size());
+            break;
+        }
+
+        // Find tool name
+        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
+        if (!func_name) {
+            auto [sz, tc] = try_find_tool_end();
+            func_name = tc;
+        }
+        if (!func_name) {
+            // Partial tool name not supported
+            throw common_chat_msg_partial_exception("incomplete tool_call");
+        }
+        // If the model generate multiple tool call and the first tool call has no argument
+        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
+            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
+            auto [sz, tc] = try_find_tool_end();
+            func_name = tc;
+        }
+
+        // Parse tool name
+        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
+        std::string function_name = string_strip(func_name->prelude);
+        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
+        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
+            if (string_starts_with(function_name, "functions.")) {
+                static const std::regex re(":\\d+$");
+                if (std::regex_search(function_name, re)) {
+                    function_name = function_name.substr(10, function_name.rfind(":") - 10);
+                }
+            }
+        }
+
+        // Argument JSON
+        json arguments = json::object();
+
+        // Helper to generate a partial argument JSON
+        const auto gen_partial_args = [&](auto set_partial_arg) {
+            gen_partial_json(set_partial_arg, arguments, builder, function_name);
+        };
+
+        // Parse all arg_key/arg_value pairs
+        while (auto tc = builder.try_find_literal(form.key_start)) {
+            if (!all_space(tc->prelude)) {
+                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
+                        gbnf_format_literal(form.key_start).c_str(),
+                        gbnf_format_literal(tc->prelude).c_str()
+                );
+                builder.move_to(tc->groups[0].begin - tc->prelude.size());
+                break;
+            }
+            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
+                auto tool_call_arg = arguments.dump();
+                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
+                    tool_call_arg.resize(tool_call_arg.size() - 1);
+                }
+                builder.add_tool_call(function_name, "", tool_call_arg);
+                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
+            }
+
+            // Parse arg_key
+            auto key_res = builder.try_find_literal(form.key_val_sep);
+            if (!key_res) {
+                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
+                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
+            }
+            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
+                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
+                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
+            }
+            auto &key = key_res->prelude;
+            recovery = false;
+
+            // Parse arg_value
+            if (form.key_val_sep2) {
+                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
+                    if (!all_space(tc->prelude)) {
+                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
+                                gbnf_format_literal(tc->prelude).c_str(),
+                                gbnf_format_literal(form.key_val_sep).c_str(),
+                                gbnf_format_literal(*form.key_val_sep2).c_str()
+                        );
+                        return return_error(builder, start_pos, false);
+                    }
+                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
+                    }
+                } else {
+                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
+                }
+            }
+            auto val_start = builder.pos();
+
+            // Test if arg_val is a partial JSON
+            std::optional<common_json> value_json = std::nullopt;
+            if (!form.raw_argval || !*form.raw_argval) {
+                try { value_json = builder.try_consume_json(); }
+                catch (const std::runtime_error&) { builder.move_to(val_start); }
+                // TODO: Delete this when json_partial adds top-level support for null/true/false
+                if (builder.pos() == val_start) {
+                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
+                    builder.consume_spaces();
+                    std::string_view sv = utf8_truncate_safe_view(builder.input());
+                    sv.remove_prefix(builder.pos());
+                    std::string rest = "a";
+                    if (sv.size() < 6) rest = sv;
+                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
+                        value_json = {123, {"123", "123"}};
+                        builder.consume_rest();
+                    } else {
+                        builder.move_to(val_start);
+                    }
+                }
+            }
+
+            // If it is a JSON and followed by </arg_value>, parse as json
+            // cannot support streaming because it may be a plain text starting with JSON
+            if (value_json) {
+                auto json_end = builder.pos();
+                builder.consume_spaces();
+                if (builder.pos() == builder.input().size()) {
+                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
+                        arguments[key] = value_json->json;
+                        auto json_str = arguments.dump();
+                        if (!value_json->healing_marker.json_dump_marker.empty()) {
+                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
+                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
+                        } else {
+                            GGML_ASSERT(json_str.back() == '}');
+                            json_str.resize(json_str.size() - 1);
+                        }
+                        builder.add_tool_call(function_name, "", json_str);
+                    } else {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                    }
+                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
+                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
+                }
+                builder.move_to(json_end);
+                auto [val_end_size, tc] = try_find_val_end();
+                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
+                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
+                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
+                    } else arguments[key] = value_json->json;
+                } else builder.move_to(val_start);
+            }
+
+            // If not, parse as plain text
+            if (val_start == builder.pos()) {
+                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
+                    auto &value_str = value_plain->prelude;
+                    if (form.trim_raw_argval) value_str = string_strip(value_str);
+                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
+                        throw common_chat_msg_partial_exception(
+                                "Expected " + gbnf_format_literal(form.val_end) +
+                                " after " + gbnf_format_literal(form.key_val_sep) +
+                                (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
+                        );
+                    }
+                    arguments[key] = value_str;
+                } else {
+                    if (form.trim_raw_argval) {
+                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
+                    } else {
+                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
+                    }
+                    throw common_chat_msg_partial_exception(
+                            "Expected " + gbnf_format_literal(form.val_end) +
+                            " after " + gbnf_format_literal(form.key_val_sep) +
+                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
+                    );
+                }
+            }
+        }
+
+        // Consume closing tag
+        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
+            if (!all_space(tc->prelude)) {
+                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                        gbnf_format_literal(form.tool_end).c_str(),
+                        gbnf_format_literal(tc->prelude).c_str()
+                );
+                return return_error(builder, start_pos, recovery);
+            }
+            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
+                // Add the parsed tool call
+                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
+                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
+                }
+                recovery = false;
+                continue;
+            }
+        }
+
+        auto tool_call_arg = arguments.dump();
+        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
+            tool_call_arg.resize(tool_call_arg.size() - 1);
+        }
+        builder.add_tool_call(function_name, "", tool_call_arg);
+        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
+    }
+    if (auto tc = builder.try_find_literal(form.scope_end)) {
+        if (!all_space(tc->prelude)) {
+            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                    gbnf_format_literal(form.scope_end).c_str(),
+                    gbnf_format_literal(tc->prelude).c_str()
+            );
+            return return_error(builder, start_pos, recovery);
+        }
+    } else {
+        if (all_space(form.scope_end)) return true;
+        builder.consume_spaces();
+        if (builder.pos() == builder.input().size())
+            throw common_chat_msg_partial_exception("incomplete tool calls");
+        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                gbnf_format_literal(form.scope_end).c_str(),
+                gbnf_format_literal(builder.consume_rest()).c_str()
+        );
+        return return_error(builder, start_pos, recovery);
+    }
+
+    return true;
+}
+
+/**
+ * Try to consume XML-style tool calls laid out as described by `form`, starting at the current position.
+ * Returns whatever parse_xml_tool_calls() returns. If it throws xml_toolcall_syntax_exception, the position and
+ * the number of recorded tool calls are restored to their values on entry and false is returned.
+ * May cause std::runtime_error if there is invalid syntax because partial valid tool call is already sent out to client.
+ * form.scope_start, form.tool_sep and form.scope_end can be empty.
+ */
+bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
+    const auto saved_pos        = pos_;
+    const auto saved_tool_count = result_.tool_calls.size();
+    try {
+        return parse_xml_tool_calls(*this, form);
+    } catch (const xml_toolcall_syntax_exception &) {
+        // Syntax error: undo everything this attempt consumed or recorded.
+        move_to(saved_pos);
+        result_.tool_calls.resize(saved_tool_count);
+    }
+    return false;
+}
+
+/**
+ * Parse a whole message that can mix reasoning blocks, plain content and XML-style tool calls, feeding each piece to `builder`.
+ * - builder:     parser whose input is consumed and whose result receives content / reasoning content / tool calls.
+ * - form:        literals that delimit tool calls (see xml_tool_call_format).
+ * - start_think: opening tag of a reasoning block; end_think: its closing tag.
+ * When reasoning_format is COMMON_REASONING_FORMAT_NONE or reasoning_in_content is set, reasoning is kept in the content
+ * together with its think tags; otherwise it is passed to add_reasoning_content() and removed from the content.
+ * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
+ */
+inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
+    constexpr auto rstrip = [](std::string &s) { // remove trailing whitespace in place
+        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
+    };
+    // Erase str[l..r] (inclusive) plus the whitespace around it. Unless the erased span starts at index 0, two '\n' are left in its place. Returns the index right after the kept newlines (or 0).
+    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
+        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l]))); // walk left over whitespace; at l == 0 the decrement wraps and the bound check ends the loop
+        ++l; // first character to erase
+        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r]))); // r becomes the exclusive end: first non-space after the span
+        if (l < r) str[l] = '\n';
+        if (l + 1 < r) str[l + 1] = '\n';
+        if (l != 0) l += 2; // keep the two newlines written above, except at the very start of the string
+        str.erase(l, r - l);
+        return l;
+    };
+    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) { // cut the longest tail of `content` that equals the beginning (or all) of any pattern in `list`
+        auto best_match = content.size(); // smallest start index of a matching tail; content.size() means "no match"
+        for (auto pattern: list) {
+            if (pattern.size() == 0) continue;
+            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
+                auto match_len = content.size() - match_idx;
+                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
+                    best_match = match_idx;
+                }
+            }
+        }
+        if (content.size() > best_match) {
+            content.erase(best_match);
+        }
+    };
+    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) { // drop a trailing, possibly truncated, think tag or tool-call literal of `form`
+        return trim_suffix(content, {
+            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
+            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
+            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
+            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
+            form.scope_end
+        });
+    };
+
+
+    // Skip leading whitespace, but hand back any tail of it that could be the beginning of a keyword, so keyword matching is unaffected
+    static const common_regex spaces_regex("\\s*");
+    {
+        auto tc = builder.consume_regex(spaces_regex);
+        auto spaces = builder.str(tc.groups[0]);
+        auto s1 = spaces.size();
+        trim_potential_partial_word(spaces);
+        auto s2 = spaces.size();
+        builder.move_to(builder.pos() - (s1 - s2)); // rewind by the number of characters that were trimmed
+    }
+
+    // Main loop: each iteration handles the text up to the next tool-call start (or the rest of the input), then the tool call itself
+    bool reasoning_unclosed = builder.syntax().thinking_forced_open; // true while inside a think block whose end tag has not been seen yet
+    std::string unclosed_reasoning_content(""); // reasoning text gathered so far for the still-open think block
+    for (;;) {
+        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start); // NOTE(review): presumably finds scope_start followed by tool_start with optional whitespace between them — confirm against the helper
+        std::string content;
+        std::string tool_call_start;
+
+        if (tc) {
+            content = std::move(tc->prelude);
+            tool_call_start = builder.str(tc->groups[0]);
+            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
+        } else {
+            content = builder.consume_rest();
+            utf8_truncate_safe_resize(content);
+        }
+
+        // Continue a think block that was opened earlier (or forced open) and is not closed yet
+        if (reasoning_unclosed) {
+            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) { // no end tag yet and more input follows
+                unclosed_reasoning_content += content;
+                if (!(form.allow_toolcall_in_think && tc)) {
+                    unclosed_reasoning_content += tool_call_start; // the matched tool start is just reasoning text; keep scanning
+                    continue;
+                }
+            } else { // end tag found, or the input is exhausted
+                reasoning_unclosed = false;
+                std::string reasoning_content;
+                if (pos == std::string::npos) {
+                    reasoning_content = std::move(content);
+                } else {
+                    reasoning_content = content.substr(0, pos);
+                    content.erase(0, pos + end_think.size());
+                }
+                if (builder.pos() == builder.input().size() && all_space(content)) { // reasoning is the last thing in the input: it may end in a truncated keyword
+                    rstrip(reasoning_content);
+                    trim_potential_partial_word(reasoning_content);
+                    rstrip(reasoning_content);
+                    if (reasoning_content.empty()) {
+                        rstrip(unclosed_reasoning_content);
+                        trim_potential_partial_word(unclosed_reasoning_content);
+                        rstrip(unclosed_reasoning_content);
+                        if (unclosed_reasoning_content.empty()) continue; // nothing to report
+                    }
+                }
+                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) { // reasoning stays inline in the content, wrapped in its tags
+                    builder.add_content(start_think);
+                    builder.add_content(unclosed_reasoning_content);
+                    builder.add_content(reasoning_content);
+                    if (builder.pos() != builder.input().size() || !all_space(content))
+                        builder.add_content(end_think);
+                } else {
+                    builder.add_reasoning_content(unclosed_reasoning_content);
+                    builder.add_reasoning_content(reasoning_content);
+                }
+                unclosed_reasoning_content.clear();
+            }
+        }
+
+        // Handle every further think block that starts inside `content`
+        bool toolcall_in_think = false;
+        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
+            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) { // complete block
+                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
+                    builder.add_reasoning_content(reasoning_content);
+                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1); // cut the whole block out of the content
+                } else {
+                    think_start = think_end + end_think.size() - 1; // leave the block in the content and search after it
+                }
+            } else {
+                // No end_think follows this start_think: the rest of `content`, and the tool-call start after it, lie inside a think block
+                // The tool-call start text is kept as reasoning unless tool calls are allowed inside think blocks
+                if (form.allow_toolcall_in_think) {
+                    unclosed_reasoning_content = content.substr(think_start + start_think.size());
+                } else {
+                    unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
+                }
+                reasoning_unclosed = true;
+                content.resize(think_start); // also makes the next find() return npos, ending the loop
+                toolcall_in_think = true;
+            }
+        }
+
+        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+            rstrip(content);
+            // Remove every end_think tag still left in the content (closing tags without a matching opening tag)
+            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
+                while (pos != std::string::npos) {
+                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
+                    pos = content.rfind(end_think, pos);
+                }
+            }
+            // Strip both ends when the content starts with whitespace
+            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
+                content = string_strip(content);
+            }
+        }
+
+        // At the end of a partial input, drop a trailing fragment that may be the beginning of a keyword
+        if (builder.pos() == builder.input().size() && builder.is_partial()) {
+            if (unclosed_reasoning_content.empty()) {
+                rstrip(content);
+                trim_potential_partial_word(content);
+                rstrip(content);
+            } else {
+                rstrip(unclosed_reasoning_content);
+                trim_potential_partial_word(unclosed_reasoning_content);
+                rstrip(unclosed_reasoning_content);
+            }
+        }
+
+        // Emit unclosed_reasoning_content right away if allow_toolcall_in_think is set
+        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
+            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+                builder.add_reasoning_content(unclosed_reasoning_content);
+            } else {
+                if (content.empty()) {
+                    content = start_think + unclosed_reasoning_content;
+                } else {
+                    content += "\n\n" + start_think;
+                    content += unclosed_reasoning_content;
+                }
+            }
+            unclosed_reasoning_content.clear();
+        }
+
+        // Add content
+        if (!content.empty()) {
+            // Separate this block from previously added content with a blank line (only when reasoning is extracted)
+            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
+                builder.add_content("\n\n");
+            }
+            builder.add_content(content);
+        }
+
+        // This tool-call start is inside a think block and allow_toolcall_in_think is not set: skip this tool call
+        if (toolcall_in_think && !form.allow_toolcall_in_think) {
+            continue;
+        }
+
+        // There is no tool call and all content is parsed
+        if (!tc) {
+            GGML_ASSERT(builder.pos() == builder.input().size());
+            GGML_ASSERT(unclosed_reasoning_content.empty());
+            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
+            break;
+        }
+
+        builder.move_to(tc->groups[0].begin); // rewind to the start of the matched tool-call opening
+        if (builder.try_consume_xml_tool_calls(form)) {
+            auto end_of_tool = builder.pos();
+            builder.consume_spaces();
+            if (builder.pos() != builder.input().size()) { // more non-space text follows the tool calls
+                builder.move_to(end_of_tool);
+                if (!builder.result().content.empty()) {
+                    builder.add_content("\n\n");
+                }
+            }
+        } else { // not a valid tool call: emit one character as content and retry from the next position
+            static const common_regex next_char_regex(".");
+            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
+            rstrip(c);
+            builder.add_content(c);
+        }
+    }
+}
+
+/**
+ * Parse content that mixes reasoning blocks and XML-style tool calls.
+ * Member entry point: forwards to parse_msg_with_xml_tool_calls() with *this as the builder.
+ */
+void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
+    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
+}
--- a/common/chat-parser-xml-toolcall.h
+++ b/common/chat-parser-xml-toolcall.h
@ -0,0 +1,45 @@
+#pragma once
+
+#include "chat.h"
+
+#include <nlohmann/json.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+
+// Sample config:
+// MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
+// GLM 4.5   (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
+struct xml_tool_call_format { // literals that delimit one family of XML-style tool calls; the two comment columns show the sample configs above
+    std::string scope_start; // <minimax:tool_call>\n  // \n                      // can be empty
+    std::string tool_start;  // <invoke name=\"        // <tool_call>
+    std::string tool_sep;    // \">\n                  // \n                      // can be empty only for parse_xml_tool_calls
+    std::string key_start;   // <parameter name=\"     // <arg_key>
+    std::string key_val_sep; // \">                    // </arg_key>\n<arg_value>
+    std::string val_end;     // </parameter>\n         // </arg_value>\n
+    std::string tool_end;    // </invoke>\n            // </tool_call>\n
+    std::string scope_end;   // </minimax:tool_call>   //                         // can be empty
+    // Set this if there can be dynamic spaces inside key_val_sep.
+    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
+    std::optional<std::string> key_val_sep2 = std::nullopt;
+    // Set true if argval should only be raw string. e.g. Hello "world" hi
+    // Set false if argval should only be json string. e.g. "Hello \"world\" hi"
+    // Defaults to std::nullopt, both will be allowed.
+    std::optional<bool> raw_argval = std::nullopt;
+    std::optional<std::string> last_val_end = std::nullopt; // optional alternative to val_end; presumably the terminator of the last argument value — TODO confirm
+    std::optional<std::string> last_tool_end = std::nullopt; // optional alternative to tool_end, also accepted when looking for the end of a tool call
+    bool trim_raw_argval = false; // when true, plain-text argument values are passed through string_strip()
+    bool allow_toolcall_in_think = false; // when true, tool calls that start inside an unclosed think block are parsed instead of being kept as reasoning text
+};
+
+// Make a GBNF that accepts any string except those containing any of the forbidden strings.
+std::string make_gbnf_excluding(std::vector<std::string> forbids);
+
+/**
+ * Build the grammar for XML-style tool calls.
+ * form.scope_start and form.scope_end can be empty.
+ * Requires data.format for model-specific hacks.
+ */
+void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@ -0,0 +1,133 @@
+#pragma once
+
+#include "chat.h"
+#include "chat-parser-xml-toolcall.h"
+#include "json-partial.h"
+#include "regex-partial.h"
+
+#include <nlohmann/json_fwd.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+class common_chat_msg_partial_exception : public std::runtime_error { // thrown by the parsers when the input ends in the middle of a construct (e.g. a partial literal or an incomplete tool call)
+  public:
+    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
+};
+
+class common_chat_msg_parser { // cursor-based parser over one input string that accumulates a common_chat_msg result
+    std::string input_; // text being parsed
+    bool is_partial_; // constructor argument; presumably true when the input may still be incomplete (streaming) — TODO confirm
+    common_chat_parser_params syntax_; // TODO: rename to params
+    std::string healing_marker_;
+
+    size_t pos_ = 0; // current read offset into input_
+    common_chat_msg result_; // message built so far (content, reasoning content, tool calls)
+
+  public:
+    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
+    const std::string & input() const { return input_; }
+    size_t pos() const { return pos_; }
+    const std::string & healing_marker() const { return healing_marker_; }
+    const bool & is_partial() const { return is_partial_; }
+    const common_chat_msg & result() const { return result_; }
+    const common_chat_parser_params & syntax() const { return syntax_; }
+
+    void move_to(size_t pos) { // set the read offset; throws std::runtime_error if pos is past the end of the input
+        if (pos > input_.size()) {
+            throw std::runtime_error("Invalid position!");
+        }
+        pos_ = pos;
+    }
+    void move_back(size_t n) { // move the read offset n characters back; throws std::runtime_error if n exceeds the current offset
+        if (pos_ < n) {
+            throw std::runtime_error("Can't move back that far!");
+        }
+        pos_ -= n;
+    }
+
+    // Get the substring of the input at the given range
+    std::string str(const common_string_range & rng) const;
+
+    // Appends to the result.content field
+    void add_content(const std::string & content);
+
+    // Appends to the result.reasoning_content field
+    void add_reasoning_content(const std::string & reasoning_content);
+
+    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
+    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
+
+    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
+    bool add_tool_call(const nlohmann::ordered_json & tool_call);
+
+    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
+    bool add_tool_calls(const nlohmann::ordered_json & arr);
+
+    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
+    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
+
+    void finish();
+
+    bool consume_spaces();
+
+    void consume_literal(const std::string & literal);
+
+    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
+
+    std::string consume_rest();
+
+    struct find_regex_result { // result of a successful search
+        std::string prelude; // NOTE(review): appears to be the input text between the search start and the match — confirm in chat-parser.cpp
+        std::vector<common_string_range> groups; // matched ranges; groups[0] is used by callers as the whole match
+    };
+
+    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
+
+    bool try_consume_literal(const std::string & literal);
+
+    std::optional<find_regex_result> try_find_literal(const std::string & literal);
+
+    find_regex_result consume_regex(const common_regex & regex);
+
+    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
+
+    std::optional<common_json> try_consume_json();
+    common_json consume_json();
+
+    struct consume_json_result {
+        nlohmann::ordered_json value;
+        bool is_partial;
+    };
+
+    /*
+        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
+
+        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
+        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
+
+        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
+        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
+        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
+    */
+    consume_json_result consume_json_with_dumped_args(
+        const std::vector<std::vector<std::string>> & args_paths = {},
+        const std::vector<std::vector<std::string>> & content_paths = {}
+    );
+    std::optional<consume_json_result> try_consume_json_with_dumped_args(
+        const std::vector<std::vector<std::string>> & args_paths = {},
+        const std::vector<std::vector<std::string>> & content_paths = {}
+    );
+
+    /**
+     * Parse XML-style tool calls for the given xml_tool_call_format. Returns false on invalid syntax, leaving the position unchanged.
+     * form.scope_start, form.tool_sep and form.scope_end can be empty.
+     */
+    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);
+
+    // Parse content that mixes reasoning blocks and XML-style tool calls
+    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");
+
+    void clear_tools();
+};
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@ -1,17 +1,13 @@
 #include "chat-peg-parser.h"

-#include "chat-auto-parser.h"
-#include "ggml.h"
-#include "peg-parser.h"
-
 #include <nlohmann/json.hpp>

-using ordered_json = nlohmann::ordered_json;
+using json = nlohmann::json;

 static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
    int count = 0;
    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
-        if (max != -1 && count >= max) {
+        if (max != -1 && count >= max) { // stop once `max` whitespace characters were removed; max == -1 means no limit
            break;
        }
        sv.remove_suffix(1);
@ -20,820 +16,109 @@ static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
    return sv;
 }

-static std::string_view trim_leading_space(std::string_view sv, int max = -1) {
-    int count = 0;
-    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.front()))) {
-        if (max != -1 && count >= max) {
-            break;
-        }
-        sv.remove_prefix(1);
-        count++;
-    }
-    return sv;
-}
-
-static std::string_view trim(std::string_view sv) {
-    return trim_trailing_space(trim_leading_space(sv, 1));
-}
-
-// Count the number of unclosed '{' braces in a JSON-like string,
-// properly skipping braces inside quoted strings.
-static int json_brace_depth(const std::string & s) {
-    int  depth     = 0;
-    bool in_string = false;
-    bool escaped   = false;
-    for (char c : s) {
-        if (escaped) {
-            escaped = false;
-            continue;
-        }
-        if (c == '\\' && in_string) {
-            escaped = true;
-            continue;
-        }
-        if (c == '"') {
-            in_string = !in_string;
-            continue;
-        }
-        if (!in_string) {
-            if (c == '{') {
-                depth++;
-            } else if (c == '}') {
-                depth--;
-            }
-        }
-    }
-    return depth;
-}
-
-// JSON-escape a string and return the inner content (without surrounding quotes).
-static std::string escape_json_string_inner(const std::string & s) {
-    std::string escaped = ordered_json(s).dump();
-    if (escaped.size() >= 2 && escaped.front() == '"' && escaped.back() == '"') {
-        return escaped.substr(1, escaped.size() - 2);
-    }
-    return escaped;
-}
-
-// Convert Python-style single-quoted strings to JSON double-quoted strings
-// Only converts outer string delimiters, properly handling escape sequences:
-// - {'key': 'value'} -> {"key": "value"}
-// - {'code': 'print(\'hello\')'} -> {"code": "print('hello')"}
-// - {'msg': 'He said "hi"'} -> {"msg": "He said \"hi\""}
-static std::string normalize_quotes_to_json(const std::string & input) {
-    std::string result;
-    result.reserve(input.size() + 16);  // May need extra space for escaping
-
-    bool in_single_quoted = false;
-    bool in_double_quoted = false;
-
-    for (size_t i = 0; i < input.size(); ++i) {
-        char c = input[i];
-
-        // Handle escape sequences
-        if (c == '\\' && i + 1 < input.size()) {
-            char next = input[i + 1];
-
-            if (in_single_quoted) {
-                // Inside a single-quoted string being converted to double quotes
-                if (next == '\'') {
-                    // \' -> ' (escaped single quote becomes unescaped in double-quoted string)
-                    result += '\'';
-                    ++i;
-                    continue;
-                }
-                if (next == '"') {
-                    // \" stays as \" (already escaped, works in double-quoted string)
-                    result += "\\\"";
-                    ++i;
-                    continue;
-                }
-                // Other escapes (\n, \\, etc.): pass through both characters
-                result += c;
-                result += next;
-                ++i;
-                continue;
-            }
-
-            if (in_double_quoted) {
-                // Inside a double-quoted string - pass through escape sequences as-is
-                result += c;
-                result += next;
-                ++i;
-                continue;
-            }
-
-            // Outside any string - just pass through the backslash
-            result += c;
-            continue;
-        }
-
-        // Handle quote characters
-        if (c == '"') {
-            if (in_single_quoted) {
-                // Unescaped double quote inside single-quoted string -> must escape for JSON
-                result += "\\\"";
-            } else {
-                // Double quote as string delimiter or outside strings
-                in_double_quoted = !in_double_quoted;
-                result += c;
-            }
-        } else if (c == '\'') {
-            if (in_double_quoted) {
-                // Single quote inside double-quoted string -> pass through
-                result += c;
-            } else if (in_single_quoted) {
-                // Closing single quote -> convert to double quote
-                in_single_quoted = false;
-                result += '"';
-            } else {
-                // Opening single quote -> convert to double quote
-                in_single_quoted = true;
-                result += '"';
-            }
-        } else {
-            result += c;
-        }
-    }
-
-    return result;
-}
-
-void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
+void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) { // walks the AST of a parse result and maps each node
    arena.visit(result, [this](const common_peg_ast_node & node) {
-        if (!node.tag.empty()) {
-            tags[node.tag] = std::string(node.text);
-        }
+        map(node); // every visited node is handed to map(), which decides what to do based on node.tag
    });
 }

-tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags) const {
-    common_peg_parse_context ctx(input, flags | extra_flags);
-    auto parse_result = arena.parse(ctx);
-
-    tag_based_peg_mapper mapper;
-    mapper.from_ast(ctx.ast, parse_result);
-
-    return { std::move(parse_result), std::move(mapper.tags) };
-}
-
-tagged_parse_result tagged_peg_parser::parse_anywhere_and_extract(const std::string & input) const {
-    if (input.empty()) {
-        return parse_and_extract(input);
-    }
-    for (size_t i = 0; i < input.size(); i++) {
-        common_peg_parse_context ctx(input, flags);
-        auto parse_result = arena.parse(ctx, i);
-        if (parse_result.success() || i == input.size() - 1) {
-            tag_based_peg_mapper mapper;
-            mapper.from_ast(ctx.ast, parse_result);
-            return { std::move(parse_result), std::move(mapper.tags) };
-        }
-    }
-    GGML_ABORT("Should not happen");
-}
-
-tagged_peg_parser build_tagged_peg_parser(
-    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
-    common_peg_parser_builder builder;
-    builder.set_root(fn(builder));
-    return { builder.build() };
-}
-
-common_peg_parser common_chat_peg_builder::tag_with_safe_content(const std::string &       tag_name,
-                                                                 const std::string &       marker,
-                                                                 const common_peg_parser & p) {
-    if (marker.empty()) {
-        return zero_or_more(choice({ p, rule(tag_name, content(any())) }));
-    }
-    auto content_chunk = rule(tag_name, content(negate(literal(marker)) + any() + until(marker)));
-    return zero_or_more(choice({ p, content_chunk }));
-}
-
-std::string & common_chat_peg_mapper::args_target() {
-    return (current_tool && !current_tool->name.empty()) ? current_tool->arguments : args_buffer;
-}
-
-void common_chat_peg_mapper::from_ast(const common_peg_ast_arena &    arena,
-                                      const common_peg_parse_result & parse_result_arg) {
-    arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
-    // Flush any pending tool call that was started but never got a name
-    // This happens during partial parsing when the tool call is incomplete
-    if (pending_tool_call.has_value() && !pending_tool_call->name.empty()) {
-        if (!args_buffer.empty()) {
-            pending_tool_call->arguments = args_buffer;
-        }
-        if (closing_quote_pending && !pending_tool_call->arguments.empty()) {
-            pending_tool_call->arguments += "\"";
-        }
-        result.tool_calls.push_back(pending_tool_call.value());
-        pending_tool_call.reset();
-    }
-}
-
 void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
-    // Handle reasoning/content tags
    bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
-    bool is_content   = node.tag == common_chat_peg_builder::CONTENT;
+    bool is_content = node.tag == common_chat_peg_builder::CONTENT;

-    if (is_reasoning) { // GPT OSS can have more than 1 reasoning block, so concatenate here
-        result.reasoning_content += std::string(node.text);
+    if (is_reasoning) {
+        result.reasoning_content = std::string(trim_trailing_space(node.text)); // assignment, not append: a later REASONING node replaces the text of an earlier one
    }

    if (is_content) {
-        // Concatenate content from multiple content nodes (e.g., when reasoning markers
-        // are preserved before content markers in reasoning_format=NONE mode)
-        result.content += std::string(node.text);
+        result.content = std::string(trim_trailing_space(node.text)); // assignment, not append: a later CONTENT node replaces the text of an earlier one
    }
+}

-    // Handle tool-related tags (supporting both JSON and tagged formats)
-    bool is_tool_open  = node.tag == common_chat_peg_builder::TOOL_OPEN;
-    bool is_tool_close = node.tag == common_chat_peg_builder::TOOL_CLOSE;
-    bool is_tool_name  = node.tag == common_chat_peg_builder::TOOL_NAME;
-    bool is_tool_id    = node.tag == common_chat_peg_builder::TOOL_ID;
-    bool is_tool_args  = node.tag == common_chat_peg_builder::TOOL_ARGS;
-    bool is_arg_open   = node.tag == common_chat_peg_builder::TOOL_ARG_OPEN;
-    bool is_arg_close  = node.tag == common_chat_peg_builder::TOOL_ARG_CLOSE;
-    bool is_arg_name         = node.tag == common_chat_peg_builder::TOOL_ARG_NAME;
-    bool is_arg_value        = node.tag == common_chat_peg_builder::TOOL_ARG_VALUE;
-    bool is_arg_string_value = node.tag == common_chat_peg_builder::TOOL_ARG_STRING_VALUE;
+void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) { // maps tool open/id/name/args nodes onto entries of result.tool_calls
+    common_chat_peg_mapper::map(node); // reasoning/content tags are handled by the base mapper first
+
+    bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
+    bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
+    bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;

    if (is_tool_open) {
-        pending_tool_call     = common_chat_tool_call();
-        current_tool          = &pending_tool_call.value();
-        arg_count             = 0;
-        args_buffer.clear();
-        closing_quote_pending = false;
+        result.tool_calls.emplace_back(); // each TOOL_OPEN node starts a new, empty tool call entry
+        current_tool = &result.tool_calls.back(); // the id/name/args branches below fill this entry
    }

    if (is_tool_id && current_tool) {
-        auto text = trim_trailing_space(node.text);
-        if (text.size() >= 2 && text.front() == '"' && text.back() == '"') {
-            text = text.substr(1, text.size() - 2);
-        }
-        current_tool->id = std::string(text);
+        current_tool->id = std::string(trim_trailing_space(node.text));
    }

    if (is_tool_name && current_tool) {
        current_tool->name = std::string(trim_trailing_space(node.text));
-        // Now that we have the name, populate the arguments from the buffer
-        if (!args_buffer.empty()) {
-            current_tool->arguments = args_buffer;
-            args_buffer.clear();
-        } else if (current_tool->arguments.empty()) {
-            current_tool->arguments = "{";
-        }
-        // Add the tool call to results so streaming can see it
-        if (pending_tool_call.has_value()) {
-            result.tool_calls.push_back(pending_tool_call.value());
-            pending_tool_call.reset();
-            current_tool = &result.tool_calls.back();
-        }
    }

    if (is_tool_args && current_tool) {
-        // For JSON format: arguments come as a complete JSON object
-        // For tagged format: built up from individual arg_name/arg_value nodes
-        auto text = trim_trailing_space(node.text);
-        if (!text.empty() && text.front() == '{') {
-            args_target() = std::string(text);
-        }
+        current_tool->arguments = std::string(trim_trailing_space(node.text)); // the node text becomes the arguments string, minus trailing whitespace
+    }
+}
+
+void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { // builds each tool call's `arguments` JSON object text piece by piece from name/value nodes
+    common_chat_peg_mapper::map(node); // reasoning/content tags are handled by the base mapper first
+
+    bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
+    bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
+    bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
+    bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
+    bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
+    bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
+    bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
+
+    if (is_tool_open) {
+        result.tool_calls.emplace_back(); // each TOOL_OPEN node starts a new, empty tool call entry
+        current_tool = &result.tool_calls.back();
+        arg_count = 0;
+    }
+
+    if (is_tool_name && current_tool) { // guarded like every sibling branch: without an open tool call there is nothing to name
+        current_tool->name = std::string(node.text);
+        current_tool->arguments = "{"; // opens the JSON object; the TOOL_CLOSE branch appends the matching "}"
    }

    if (is_arg_open) {
-        closing_quote_pending = false;
+        needs_closing_quote = false;
    }

    if (is_arg_name && current_tool) {
-        std::string arg_entry;
        if (arg_count > 0) {
-            arg_entry = ",";
+            current_tool->arguments += ","; // separator before the second and later arguments
        }
-        arg_entry += ordered_json(trim(node.text)).dump() + ":";
+        current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":"; // dump() yields the quoted, escaped key
        ++arg_count;
-
-        auto & target = args_target();
-        if (target.empty()) {
-            target = "{";
-        }
-        target += arg_entry;
    }

-    if ((is_arg_value || is_arg_string_value) && current_tool) {
-        std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1));
-
-        std::string value_to_add;
-        if (value_content.empty() && is_arg_string_value) {
-            // Empty string value - arg_close will add the closing quote
-            value_to_add          = "\"";
-            closing_quote_pending = true;
-        } else if (!value_content.empty() && is_arg_string_value) {
-            // Schema declares this as string type - always treat as literal string value
-            if (!closing_quote_pending) {
-                value_to_add          = "\"";
-                closing_quote_pending = true;
-            }
-            value_to_add += escape_json_string_inner(value_content);
-        } else if (!value_content.empty()) {
-            // For potential containers, normalize Python-style single quotes to JSON double quotes
-            bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
-            if (is_potential_container) {
-                value_content = normalize_quotes_to_json(value_content);
-            }
-
-            // Try to parse as JSON value (number, bool, null, object, array)
-            try {
-                ordered_json parsed = ordered_json::parse(value_content);
-                if (parsed.is_string()) {
-                    // Don't add closing quote yet (added by arg_close) for monotonic streaming
-                    std::string escaped = parsed.dump();
-                    if (!escaped.empty() && escaped.back() == '"') {
-                        escaped.pop_back();
-                    }
-                    value_to_add          = escaped;
-                    closing_quote_pending = true;
-                } else {
-                    // Non-string values: use raw content to preserve whitespace for monotonicity
-                    value_to_add = value_content;
-                }
-            } catch (...) {
-                if (node.is_partial && is_potential_container) {
-                    // Partial container: pass through the already-normalized content
-                    value_to_add = value_content;
-                } else {
-                    // Not valid JSON - treat as string value
-                    if (!closing_quote_pending) {
-                        value_to_add          = "\"";
-                        closing_quote_pending = true;
-                    }
-                    value_to_add += escape_json_string_inner(value_content);
-                }
-            }
-        }
-
-        args_target() += value_to_add;
+    if (is_arg_string && current_tool) {
+        // Serialize to JSON, but exclude the end quote
+        std::string dumped = json(trim_trailing_space(node.text)).dump();
+        current_tool->arguments += dumped.substr(0, dumped.size() - 1);
+        needs_closing_quote = true; // the closing quote is appended by the TOOL_ARG_CLOSE or TOOL_CLOSE branch
    }

    if (is_arg_close && current_tool) {
-        if (closing_quote_pending) {
-            args_target() += "\"";
-            closing_quote_pending = false;
+        if (needs_closing_quote) {
+            current_tool->arguments += "\"";
+            needs_closing_quote = false;
        }
    }

+    if (is_arg_json && current_tool) {
+        current_tool->arguments += std::string(trim_trailing_space(node.text)); // JSON values are appended as-is, without quoting
+    }
+
    if (is_tool_close && current_tool) {
-        // Flush buffer to arguments if tool name was never seen
-        if (current_tool->name.empty() && !args_buffer.empty()) {
-            current_tool->arguments = args_buffer;
-            args_buffer.clear();
-        }
-        // Close any pending string quote
-        if (closing_quote_pending) {
+        if (needs_closing_quote) {
            current_tool->arguments += "\"";
-            closing_quote_pending = false;
-        }
-        // Close any unclosed braces (accounts for nested objects)
-        for (int d = json_brace_depth(current_tool->arguments); d > 0; d--) {
-            current_tool->arguments += "}";
-        }
-        // Add tool call to results if named; otherwise discard
-        if (pending_tool_call.has_value()) {
-            if (!current_tool->name.empty()) {
-                result.tool_calls.push_back(pending_tool_call.value());
-            }
-            pending_tool_call.reset();
+            needs_closing_quote = false;
        }
+        current_tool->arguments += "}"; // closes the object opened in the TOOL_NAME branch
    }
 }
-
-common_peg_parser common_chat_peg_builder::standard_constructed_tools(
-    const std::map<std::string, std::string> & markers,
-    const ordered_json &                       tools,
-    bool                                       parallel_tool_calls,
-    bool                                       force_tool_calls) {
-    if (!tools.is_array() || tools.empty()) {
-        return eps();
-    }
-
-    // Extract markers with defaults
-    auto get_marker = [&markers](const std::string & key, const std::string & default_val = "") -> std::string {
-        auto it = markers.find(key);
-        return it != markers.end() ? it->second : default_val;
-    };
-
-    std::string section_start    = get_marker("tool_call_start_marker", "<tool_call>");
-    std::string section_end      = get_marker("tool_call_end_marker", "</tool_call>");
-    std::string func_opener      = get_marker("function_opener", "<function=");
-    std::string func_name_suffix = get_marker("function_name_suffix", ">");
-    std::string func_closer      = get_marker("function_closer", "</function>");
-    std::string param_key_prefix = get_marker("parameter_key_prefix", "<param=");
-    std::string param_key_suffix = get_marker("parameter_key_suffix", ">");
-    std::string param_closer     = get_marker("parameter_closer", "</param>");
-
-    // Build tool choices for tagged format
-    auto tool_choices = choice();
-
-    for (const auto & tool_def : tools) {
-        if (!tool_def.contains("function")) {
-            continue;
-        }
-        const auto &   function = tool_def.at("function");
-        std::string    name     = function.at("name");
-        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
-
-        // Build argument parsers
-        auto args = eps();
-        if (params.contains("properties") && !params["properties"].empty()) {
-            auto arg_choice = choice();
-            for (const auto & el : params["properties"].items()) {
-                const std::string & prop_name = el.key();
-
-                auto arg_name_parser =
-                    choice({ literal(prop_name), literal("\"" + prop_name + "\""), literal("'" + prop_name + "'") });
-
-                auto arg_rule = tool_arg(tool_arg_open(literal(param_key_prefix)) + tool_arg_name(arg_name_parser) +
-                                         literal(param_key_suffix) + tool_arg_value(until(param_closer)) +
-                                         tool_arg_close(literal(param_closer)));
-                arg_choice |= arg_rule;
-            }
-            args = zero_or_more(arg_choice + space());
-        }
-
-        // Build function parser: <function=name>args</function>
-        auto tool_parser = tool(tool_open(literal(func_opener) + tool_name(literal(name)) + literal(func_name_suffix)) +
-                                space() + tool_args(args) + space() + tool_close(literal(func_closer)));
-
-        tool_choices |= rule("tool-" + name, tool_parser);
-    }
-
-    // Build the section with markers
-    auto section =
-        parallel_tool_calls ?
-            trigger_rule("tool-call", literal(section_start) + space() + one_or_more(tool_choices + space()) +
-                                          literal(section_end)) :
-            trigger_rule("tool-call", literal(section_start) + space() + tool_choices + space() + literal(section_end));
-
-    return force_tool_calls ? section : optional(section);
-}
-
-// Python-style tool calls: name(arg1="value1", arg2=123)
-// Used only by LFM2 for now, so we don't merge it into autoparser
-common_peg_parser common_chat_peg_builder::python_style_tool_calls(
-    const ordered_json & tools,
-    bool                 parallel_tool_calls) {
-    if (!tools.is_array() || tools.empty()) {
-        return eps();
-    }
-
-    auto tool_choices = choice();
-
-    for (const auto & tool_def : tools) {
-        if (!tool_def.contains("function")) {
-            continue;
-        }
-        const auto &   function = tool_def.at("function");
-        std::string    name     = function.at("name");
-        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
-
-        auto args = eps();
-        if (params.contains("properties") && !params["properties"].empty()) {
-            auto arg_choice = choice();
-            for (const auto & el : params["properties"].items()) {
-                const std::string & prop_name = el.key();
-                const auto & prop_def = el.value();
-                bool is_string_type = (prop_def.contains("type") && prop_def["type"] == "string");
-
-                auto arg_name_parser = literal(prop_name);
-
-                common_peg_parser arg_value_parser = eps();
-                auto string_value_parser = choice({
-                    literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
-                    literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
-                });
-
-                if (is_string_type) {
-                    arg_value_parser = string_value_parser;
-                } else {
-                    arg_value_parser = tool_arg_value(python_value());
-                }
-
-                // Full argument: name="value" or name=value
-                auto arg_rule = tool_arg(
-                    tool_arg_open(eps()) +
-                    tool_arg_name(arg_name_parser) +
-                    literal("=") +
-                    arg_value_parser +
-                    tool_arg_close(eps())
-                );
-                arg_choice |= arg_rule;
-            }
-
-            args = arg_choice + zero_or_more("," + space() + arg_choice);
-        }
-
-        auto tool_parser = tool(tool_open(tool_name(literal(name)) + literal("(")) +
-            space() + tool_args(args) + space() + tool_close(literal(")"))
-        );
-
-        tool_choices |= rule("tool-" + name, tool_parser);
-    }
-
-    if (parallel_tool_calls) {
-        return "[" + space() + tool_choices + zero_or_more("," + space() + tool_choices) + space() + "]";
-    }
-    return "[" + space() + tool_choices + space() + "]";
-}
-
-// Helper: Parse dot notation key into prefix and field name
-static std::pair<std::string, std::string> parse_key_spec(const std::string & key) {
-    auto dot_pos = key.find('.');
-    if (dot_pos == std::string::npos) {
-        return {"", key};  // Top-level field
-    }
-    return {key.substr(0, dot_pos), key.substr(dot_pos + 1)};
-}
-
-// Mode 1: function_is_key — parse {"function_name": {...}}
-common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
-    const ordered_json & tools,
-    const std::string &  args_key,
-    const std::string &  effective_args_key,
-    const std::string &  call_id_key,
-    const std::string &  gen_call_id_key) {
-
-    auto tool_choices = choice();
-
-    for (const auto & tool_def : tools) {
-        if (!tool_def.contains("function")) {
-            continue;
-        }
-        const auto &   function = tool_def.at("function");
-        std::string    name     = function.at("name");
-        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
-
-        // Build inner object fields
-        std::vector<common_peg_parser> inner_fields;
-
-        if (!call_id_key.empty()) {
-            auto id_parser = atomic(
-                literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
-                literal("\"") + tool_id(string_content('"')) + literal("\"")
-            );
-            inner_fields.push_back(optional(id_parser + space() + optional(literal(",") + space())));
-        }
-
-        if (!gen_call_id_key.empty()) {
-            auto gen_id_parser = atomic(
-                literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
-                choice({
-                    literal("\"") + tool_id(string_content('"')) + literal("\""),
-                    tool_id(json_number())
-                })
-            );
-            inner_fields.push_back(optional(gen_id_parser + space() + optional(literal(",") + space())));
-        }
-
-        // Arguments — either wrapped in args_key or parsed directly
-        common_peg_parser args_parser = eps();
-        if (args_key.empty()) {
-            args_parser = tool_args(schema(json(), "tool-" + name + "-schema", params));
-        } else {
-            args_parser = literal("\"" + effective_args_key + "\"") + space() + literal(":") + space() +
-                          tool_args(schema(json(), "tool-" + name + "-schema", params));
-        }
-        inner_fields.push_back(args_parser);
-
-        // Build inner object parser
-        common_peg_parser inner_object = eps();
-        if (args_key.empty() && inner_fields.size() == 1) {
-            inner_object = inner_fields[0];
-        } else {
-            inner_object = literal("{") + space();
-            for (size_t i = 0; i < inner_fields.size(); i++) {
-                inner_object = inner_object + inner_fields[i];
-                if (i < inner_fields.size() - 1) {
-                    inner_object = inner_object + space();
-                }
-            }
-            inner_object = inner_object + space() + literal("}");
-        }
-
-        auto tool_parser = tool(
-            tool_open(literal("{")) + space() +
-            literal("\"") + tool_name(literal(name)) + literal("\"") +
-            space() + literal(":") + space() +
-            inner_object +
-            space() + tool_close(literal("}"))
-        );
-
-        tool_choices |= rule("tool-" + name, tool_parser);
-    }
-
-    return tool_choices;
-}
-
-// Mode 2: Nested keys (dot notation like "function.name")
-common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
-    const ordered_json & tools,
-    const std::string &  effective_name_key,
-    const std::string &  effective_args_key,
-    const std::string &  call_id_key,
-    const std::string &  gen_call_id_key) {
-
-    auto tool_choices = choice();
-
-    auto name_spec = parse_key_spec(effective_name_key);
-    auto args_spec = parse_key_spec(effective_args_key);
-
-    std::string nested_prefix     = !name_spec.first.empty() ? name_spec.first  : args_spec.first;
-    std::string nested_name_field = !name_spec.first.empty() ? name_spec.second  : effective_name_key;
-    std::string nested_args_field = !args_spec.first.empty() ? args_spec.second  : effective_args_key;
-
-    for (const auto & tool_def : tools) {
-        if (!tool_def.contains("function")) {
-            continue;
-        }
-        const auto &   function = tool_def.at("function");
-        std::string    name     = function.at("name");
-        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
-
-        auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
-                          literal("\"") + tool_name(literal(name)) + literal("\"");
-        auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
-                          tool_args(schema(json(), "tool-" + name + "-schema", params));
-
-        auto nested_object = literal("{") + space() +
-                            nested_name + space() + literal(",") + space() +
-                            nested_args +
-                            space() + literal("}");
-
-        // Format: { id?, "function": {...} }
-        auto tool_parser_body = tool_open(literal("{")) + space();
-
-        if (!call_id_key.empty()) {
-            auto id_spec = parse_key_spec(call_id_key);
-            if (id_spec.first.empty()) {
-                auto id_parser = atomic(
-                    literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
-                    literal("\"") + tool_id(string_content('"')) + literal("\"")
-                );
-                tool_parser_body = tool_parser_body + optional(id_parser + space() + literal(",") + space());
-            }
-        }
-
-        if (!gen_call_id_key.empty()) {
-            auto gen_id_spec = parse_key_spec(gen_call_id_key);
-            if (gen_id_spec.first.empty()) {
-                auto gen_id_parser = atomic(
-                    literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
-                    choice({
-                        literal("\"") + tool_id(string_content('"')) + literal("\""),
-                        tool_id(json_number())
-                    })
-                );
-                tool_parser_body = tool_parser_body + optional(gen_id_parser + space() + literal(",") + space());
-            }
-        }
-
-        auto nested_field = literal("\"" + nested_prefix + "\"") + space() + literal(":") + space() + nested_object;
-        tool_parser_body = tool_parser_body + nested_field + space() + tool_close(literal("}"));
-
-        tool_choices |= rule("tool-" + name, tool(tool_parser_body));
-    }
-
-    return tool_choices;
-}
-
-// Mode 3: Flat keys with optional ID fields and parameter ordering
-common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
-    const ordered_json &             tools,
-    const std::string &              effective_name_key,
-    const std::string &              effective_args_key,
-    const std::string &              call_id_key,
-    const std::string &              gen_call_id_key,
-    const std::vector<std::string> & parameters_order) {
-
-    auto tool_choices    = choice();
-    auto name_key_parser = literal("\"" + effective_name_key + "\"");
-    auto args_key_parser = literal("\"" + effective_args_key + "\"");
-
-    for (const auto & tool_def : tools) {
-        if (!tool_def.contains("function")) {
-            continue;
-        }
-        const auto &   function = tool_def.at("function");
-        std::string    name     = function.at("name");
-        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
-
-        auto tool_name_ = name_key_parser + space() + literal(":") + space() +
-                         literal("\"") + tool_name(literal(name)) + literal("\"");
-        auto tool_args_ = args_key_parser + space() + literal(":") + space() +
-                         tool_args(schema(json(), "tool-" + name + "-schema", params));
-
-        // Build ID parsers if keys are provided
-        common_peg_parser id_parser = eps();
-        if (!call_id_key.empty()) {
-            id_parser = atomic(
-                literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
-                choice({
-                    literal("\"") + tool_id(string_content('"')) + literal("\""),
-                    tool_id(json_number())
-                })
-            );
-        }
-
-        common_peg_parser gen_id_parser = eps();
-        if (!gen_call_id_key.empty()) {
-            gen_id_parser = atomic(
-                literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
-                choice({
-                    literal("\"") + tool_id(string_content('"')) + literal("\""),
-                    tool_id(json_number())
-                })
-            );
-        }
-
-        // Create (parser, key) pairs for all fields, then sort by parameters_order
-        std::vector<std::pair<common_peg_parser, std::string>> parser_pairs;
-        parser_pairs.emplace_back(tool_name_, effective_name_key);
-        parser_pairs.emplace_back(tool_args_, effective_args_key);
-        if (!call_id_key.empty()) {
-            parser_pairs.emplace_back(optional(id_parser), call_id_key);
-        }
-        if (!gen_call_id_key.empty()) {
-            parser_pairs.emplace_back(optional(gen_id_parser), gen_call_id_key);
-        }
-
-        std::sort(parser_pairs.begin(), parser_pairs.end(),
-            [&parameters_order](const auto & a, const auto & b) {
-                auto pos_a = std::find(parameters_order.begin(), parameters_order.end(), a.second);
-                auto pos_b = std::find(parameters_order.begin(), parameters_order.end(), b.second);
-                size_t idx_a = (pos_a == parameters_order.end()) ? parameters_order.size() : std::distance(parameters_order.begin(), pos_a);
-                size_t idx_b = (pos_b == parameters_order.end()) ? parameters_order.size() : std::distance(parameters_order.begin(), pos_b);
-                return idx_a < idx_b;
-            });
-
-        auto ordered_body = tool_open(literal("{")) + space();
-        for (size_t i = 0; i < parser_pairs.size(); i++) {
-            ordered_body = ordered_body + parser_pairs[i].first;
-            if (i < parser_pairs.size() - 1) {
-                ordered_body = ordered_body + space() + literal(",") + space();
-            }
-        }
-        ordered_body = ordered_body + space() + tool_close(literal("}"));
-
-        tool_choices |= rule("tool-" + name, tool(ordered_body));
-    }
-
-    return tool_choices;
-}
-
-common_peg_parser common_chat_peg_builder::standard_json_tools(
-                                                       const std::string &              section_start,
-                                                       const std::string &              section_end,
-                                                       const ordered_json &             tools,
-                                                       bool                             parallel_tool_calls,
-                                                       bool                             force_tool_calls,
-                                                       const std::string &              name_key,
-                                                       const std::string &              args_key,
-                                                       bool                             array_wrapped,
-                                                       bool                             function_is_key,
-                                                       const std::string &              call_id_key,
-                                                       const std::string &              gen_call_id_key,
-                                                       const std::vector<std::string> & parameters_order) {
-    if (!tools.is_array() || tools.empty()) {
-        return eps();
-    }
-
-    std::string effective_name_key = name_key.empty() ? "name" : name_key;
-    std::string effective_args_key = args_key.empty() ? "arguments" : args_key;
-
-    // Dispatch to the appropriate builder based on the JSON layout mode
-    common_peg_parser tool_choices = eps();
-    if (function_is_key) {
-        tool_choices = build_json_tools_function_is_key(tools, args_key, effective_args_key, call_id_key, gen_call_id_key);
-    } else {
-        auto name_spec = parse_key_spec(effective_name_key);
-        auto args_spec = parse_key_spec(effective_args_key);
-        if (!name_spec.first.empty() || !args_spec.first.empty()) {
-            tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
-        } else {
-            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
-        }
-    }
-
-    // Build the section with markers
-    auto tool_calls = tool_choices;
-    if (parallel_tool_calls) {
-        tool_calls = tool_calls + zero_or_more(space() + literal(",") + space() + tool_choices);
-    }
-
-    if (array_wrapped) {
-        tool_calls = literal("[") + space() + tool_calls + space() + literal("]");
-    }
-
-    auto section =
-        trigger_rule("tool-call", literal(section_start) + space() + tool_calls + space() + literal(section_end));
-
-    return force_tool_calls ? section : optional(section);
-}
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@ -3,9 +3,22 @@
 #include "chat.h"
 #include "peg-parser.h"

-#include <map>
-#include <optional>
-#include <vector>
+class common_chat_peg_builder : public common_peg_parser_builder {
+  public:
+    static constexpr const char * REASONING_BLOCK = "reasoning-block";  // tag name applied by reasoning_block()
+    static constexpr const char * REASONING = "reasoning";  // tag name applied by reasoning()
+    static constexpr const char * CONTENT = "content";  // tag name applied by content()
+
+    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }  // wrap p in the REASONING_BLOCK tag
+    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }  // wrap p in the REASONING tag
+    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }  // wrap p in the CONTENT tag
+};
+
+inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
+    common_chat_peg_builder builder;  // fresh builder, local to this call
+    builder.set_root(fn(builder));  // fn receives the builder; the parser it returns is passed to set_root()
+    return builder.build();  // result of build() is returned by value as the arena
+}

 class common_chat_peg_mapper {
  public:
@ -13,169 +26,82 @@ class common_chat_peg_mapper {

    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}

    virtual ~common_chat_peg_mapper() = default;

    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
    virtual void map(const common_peg_ast_node & node);
-    private:
-      // Tool call handling state
-      std::optional<common_chat_tool_call> pending_tool_call;  // Tool call waiting for name
-      common_chat_tool_call *              current_tool          = nullptr;
-      int                                  arg_count             = 0;
-      bool                                 closing_quote_pending = false;
-      std::string                          args_buffer;  // Buffer to delay arguments until tool name is known
-
-      // Returns a reference to the active argument destination string.
-      // Before tool_name is known, writes go to args_buffer; after, to current_tool->arguments.
-      std::string & args_target();
 };

-struct content_structure;
-struct tool_call_structure;
-
-class common_chat_peg_builder : public common_peg_parser_builder {
+class common_chat_peg_native_builder : public common_chat_peg_builder {
  public:
-    // Tag constants (from former common_chat_peg_base_builder)
-    static constexpr const char * REASONING_BLOCK = "reasoning-block";
-    static constexpr const char * REASONING       = "reasoning";
-    static constexpr const char * CONTENT         = "content";
+    static constexpr const char * TOOL = "tool";
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_ID = "tool-id";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARGS = "tool-args";

-    // Tag constants
-    static constexpr const char * TOOL           = "tool";
-    static constexpr const char * TOOL_OPEN      = "tool-open";
-    static constexpr const char * TOOL_CLOSE     = "tool-close";
-    static constexpr const char * TOOL_ID        = "tool-id";
-    static constexpr const char * TOOL_NAME      = "tool-name";
-    static constexpr const char * TOOL_ARGS      = "tool-args";
-    static constexpr const char * TOOL_ARG       = "tool-arg";
-    static constexpr const char * TOOL_ARG_OPEN  = "tool-arg-open";
-    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
-    static constexpr const char * TOOL_ARG_NAME         = "tool-arg-name";
-    static constexpr const char * TOOL_ARG_VALUE        = "tool-arg-value";
-    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";  // For schema-declared string types
-
-    // Low-level tag methods (from former common_chat_peg_base_builder)
-    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
-
-    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
-
-    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
-
-    common_peg_parser tag_with_safe_content(const std::string &       tag_name,
-                        const std::string &       marker,
-                        const common_peg_parser & p);
-
-    // Low-level tag methods
    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
+};
+
+class common_chat_peg_native_mapper : public common_chat_peg_mapper {
+    common_chat_tool_call * current_tool = nullptr;  // the constructor does not set this, so default it rather than leave it indeterminate
+
+  public:
+    common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+    void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
+    common_chat_peg_native_builder builder;  // fresh native builder, local to this call
+    builder.set_root(fn(builder));  // fn receives the builder; the parser it returns is passed to set_root()
+    return builder.build();  // result of build() is returned by value as the arena
+}
+
+class common_chat_peg_constructed_builder : public common_chat_peg_builder {
+  public:
+    static constexpr const char * TOOL = "tool";
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARG = "tool-arg";
+    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
+    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
+    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
+    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
+    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
+
+    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
-    common_peg_parser tool_arg_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }
-
-    // Use for schema-declared string types - won't be treated as potential JSON container
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
-    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
-
-    // Legacy-compatible helper for building standard JSON tool calls
-    // Used by tests and manual parsers
-    // name_key/args_key: JSON key names for function name and arguments
-    //   Empty or "name"/"arguments" will accept both common variations
-    //   Supports dot notation for nested objects (e.g., "function.name")
-    // array_wrapped: if true, tool calls are wrapped in JSON array [...]
-    // function_is_key: if true, function name is the JSON key (e.g., {"func_name": {...}})
-    // call_id_key: JSON key for string call ID (e.g., "id")
-    // gen_call_id_key: JSON key for generated integer call ID (e.g., "tool_call_id")
-    // parameters_order: order in which JSON fields should be parsed
-    common_peg_parser standard_json_tools(const std::string &              section_start,
-                                          const std::string &              section_end,
-                                          const nlohmann::ordered_json &   tools,
-                                          bool                             parallel_tool_calls,
-                                          bool                             force_tool_calls,
-                                          const std::string &              name_key = "",
-                                          const std::string &              args_key = "",
-                                          bool                             array_wrapped = false,
-                                          bool                             function_is_key = false,
-                                          const std::string &              call_id_key = "",
-                                          const std::string &              gen_call_id_key = "",
-                                          const std::vector<std::string> & parameters_order = {});
-
-    // Legacy-compatible helper for building XML/tagged style tool calls
-    // Used by tests and manual parsers
-    common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
-                                                 const nlohmann::ordered_json &             tools,
-                                                 bool                                       parallel_tool_calls,
-                                                 bool                                       force_tool_calls);
-
-    // Helper for Python-style function call format: name(arg1="value1", arg2=123)
-    // Used by LFM2 and similar templates
-    common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
-                                              bool                           parallel_tool_calls);
-
-  private:
-    // Implementation helpers for standard_json_tools — one per JSON tool call layout mode
-    common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
-                                                       const std::string &            args_key,
-                                                       const std::string &            effective_args_key,
-                                                       const std::string &            call_id_key,
-                                                       const std::string &            gen_call_id_key);
-
-    common_peg_parser build_json_tools_nested_keys(const nlohmann::ordered_json & tools,
-                                                   const std::string &            effective_name_key,
-                                                   const std::string &            effective_args_key,
-                                                   const std::string &            call_id_key,
-                                                   const std::string &            gen_call_id_key);
-
-    common_peg_parser build_json_tools_flat_keys(const nlohmann::ordered_json &   tools,
-                                                 const std::string &              effective_name_key,
-                                                 const std::string &              effective_args_key,
-                                                 const std::string &              call_id_key,
-                                                 const std::string &              gen_call_id_key,
-                                                 const std::vector<std::string> & parameters_order);
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
 };

-inline common_peg_arena build_chat_peg_parser(
-  const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
-  common_chat_peg_builder builder;
-  builder.set_root(fn(builder));
-  return builder.build();
-}
+class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
+    common_chat_tool_call * current_tool = nullptr;  // the constructor does not set this, so default it like the members below
+    int arg_count = 0;
+    bool needs_closing_quote = false;

-class tag_based_peg_mapper {
  public:
-    std::map<std::string, std::string> tags;
+    common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}

-    void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
+    void map(const common_peg_ast_node & node) override;
 };

-struct tagged_parse_result {
-    common_peg_parse_result              result;
-    std::map<std::string, std::string> tags;
-};
-
-struct tagged_peg_parser {
-    common_peg_arena arena;
-    common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE;
-
-    tagged_peg_parser & withDebug() {
-      flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
-      return *this;
-    }
-
-    tagged_peg_parser & withoutDebug() {
-      flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG;
-      return *this;
-    }
-
-    tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const;
-    tagged_parse_result parse_anywhere_and_extract(const std::string & input) const;
-};
-
-tagged_peg_parser build_tagged_peg_parser(
-    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
-
+inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
+    common_chat_peg_constructed_builder builder;  // fresh constructed-format builder, local to this call
+    builder.set_root(fn(builder));  // fn receives the builder; the parser it returns is passed to set_root()
+    return builder.build();  // result of build() is returned by value as the arena
+}
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@ -3,30 +3,17 @@
 #pragma once

 #include "common.h"
-#include "jinja/parser.h"
-#include "nlohmann/json_fwd.hpp"
 #include "peg-parser.h"
-#include "jinja/runtime.h"
-#include "jinja/caps.h"
-#include "nlohmann/json.hpp"
-
-#include <chrono>
 #include <functional>
-#include <map>
+#include <chrono>
 #include <string>
 #include <vector>
-
-using chat_template_caps = jinja::caps;
-using json = nlohmann::ordered_json;
+#include <map>

 #include <nlohmann/json_fwd.hpp>

 struct common_chat_templates;

-namespace autoparser {
-struct templates_params;
-}  // namespace autoparser
-
 struct common_chat_tool_call {
    std::string name;
    std::string arguments;
@ -51,85 +38,21 @@ struct common_chat_msg_content_part {
    }
 };

-struct common_chat_template {
-    jinja::program prog;
-    std::string bos_tok;
-    std::string eos_tok;
-    std::string src;
-    chat_template_caps caps;
-
-    common_chat_template(const std::string & src, const std::string & bos_token, const std::string & eos_token) {
-        jinja::lexer lexer;
-        auto lexer_res = lexer.tokenize(src);
-        this->prog = jinja::parse_from_tokens(lexer_res);
-
-        this->src = lexer_res.source;
-        this->bos_tok = bos_token;
-        this->eos_tok = eos_token;
-
-        this->caps = jinja::caps_get(prog);
-        // LOG_INF("%s: caps:\n%s\n", __func__, this->caps.to_string().c_str());
-    }
-
-    const std::string & source() const { return src; }
-    const std::string & bos_token() const { return bos_tok; }
-    const std::string & eos_token() const { return eos_tok; }
-
-    // TODO: this is ugly, refactor it somehow
-    json add_system(const json & messages, const std::string & system_prompt) const {
-        GGML_ASSERT(messages.is_array());
-        auto msgs_copy = messages;
-        if (!caps.supports_system_role) {
-            if (msgs_copy.empty()) {
-                msgs_copy.insert(msgs_copy.begin(), json{
-                    {"role", "user"},
-                    {"content", system_prompt}
-                });
-            } else {
-                auto & first_msg = msgs_copy[0];
-                if (!first_msg.contains("content")) {
-                    first_msg["content"] = "";
-                }
-                first_msg["content"] = system_prompt + "\n\n"
-                    + first_msg["content"].get<std::string>();
-            }
-        } else {
-            if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
-                msgs_copy.insert(msgs_copy.begin(), json{
-                    {"role", "system"},
-                    {"content", system_prompt}
-                });
-            } else if (msgs_copy[0].at("role") == "system") {
-                msgs_copy[0]["content"] = system_prompt;
-            }
-        }
-        return msgs_copy;
-    }
-
-    chat_template_caps original_caps() const {
-        return caps;
-    }
-
-};
-
 struct common_chat_msg {
-    std::string                               role;
-    std::string                               content;
+    std::string role;
+    std::string content;
    std::vector<common_chat_msg_content_part> content_parts;
-    std::vector<common_chat_tool_call>        tool_calls;
-    std::string                               reasoning_content;
-    std::string                               tool_name;
-    std::string                               tool_call_id;
+    std::vector<common_chat_tool_call> tool_calls;
+    std::string reasoning_content;
+    std::string tool_name;
+    std::string tool_call_id;

    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;

    bool empty() const {
-        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() &&
-               tool_name.empty() && tool_call_id.empty();
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
    }
-
-    void set_tool_call_ids(std::vector<std::string> &           ids_cache,
-                           const std::function<std::string()> & gen_tool_call_id) {
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
@ -141,28 +64,32 @@ struct common_chat_msg {
            tool_calls[i].id = ids_cache[i];
        }
    }
-
    bool operator==(const common_chat_msg & other) const {
-        return role == other.role && content == other.content && content_parts == other.content_parts &&
-               tool_calls == other.tool_calls && reasoning_content == other.reasoning_content &&
-               tool_name == other.tool_name && tool_call_id == other.tool_call_id;
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
    }
-
-    bool operator!=(const common_chat_msg & other) const { return !(*this == other); }
 };

 struct common_chat_msg_diff {
-    std::string           reasoning_content_delta;
-    std::string           content_delta;
-    size_t                tool_call_index = std::string::npos;
+    std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
    common_chat_tool_call tool_call_delta;

-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv,
-                                                           const common_chat_msg & msg_new);
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);

    bool operator==(const common_chat_msg_diff & other) const {
-        return content_delta == other.content_delta && tool_call_index == other.tool_call_index &&
-               tool_call_delta == other.tool_call_delta;
+        return reasoning_content_delta == other.reasoning_content_delta  // compare every member, including the reasoning delta
+        && content_delta == other.content_delta
+        && tool_call_index == other.tool_call_index && tool_call_delta == other.tool_call_delta;
    }
 };

@ -180,41 +107,64 @@ enum common_chat_tool_choice {

 enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
+    COMMON_CHAT_FORMAT_GENERIC,
+    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_MAGISTRAL,
+    COMMON_CHAT_FORMAT_LLAMA_3_X,
+    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GRANITE,
+    COMMON_CHAT_FORMAT_GPT_OSS,
+    COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
+    COMMON_CHAT_FORMAT_APERTUS,
+    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
+    COMMON_CHAT_FORMAT_GLM_4_5,
+    COMMON_CHAT_FORMAT_MINIMAX_M2,
+    COMMON_CHAT_FORMAT_KIMI_K2,
+    COMMON_CHAT_FORMAT_APRIEL_1_5,
+    COMMON_CHAT_FORMAT_XIAOMI_MIMO,
+    COMMON_CHAT_FORMAT_SOLAR_OPEN,
+    COMMON_CHAT_FORMAT_EXAONE_MOE,

    // These are intended to be parsed by the PEG parser
    COMMON_CHAT_FORMAT_PEG_SIMPLE,
    COMMON_CHAT_FORMAT_PEG_NATIVE,
+    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,

-    COMMON_CHAT_FORMAT_COUNT,  // Not a format, just the # formats
+    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

 struct common_chat_templates_inputs {
-    std::vector<common_chat_msg>          messages;
-    std::string                           grammar;
-    std::string                           json_schema;
-    bool                                  add_generation_prompt = true;
-    bool                                  use_jinja             = true;
+    std::vector<common_chat_msg> messages;
+    std::string grammar;
+    std::string json_schema;
+    bool add_generation_prompt = true;
+    bool use_jinja = true;
    // Parameters below only supported when use_jinja is true
-    std::vector<common_chat_tool>         tools;
-    common_chat_tool_choice               tool_choice         = COMMON_CHAT_TOOL_CHOICE_AUTO;
-    bool                                  parallel_tool_calls = false;
-    common_reasoning_format               reasoning_format    = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
-    bool                                  enable_thinking     = true;
-    std::chrono::system_clock::time_point now                 = std::chrono::system_clock::now();
-    std::map<std::string, std::string>    chat_template_kwargs;
-    bool                                  add_bos = false;
-    bool                                  add_eos = false;
+    std::vector<common_chat_tool> tools;
+    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+    bool parallel_tool_calls = false;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
+    bool enable_thinking = true;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::map<std::string, std::string> chat_template_kwargs;
+    bool add_bos = false;
+    bool add_eos = false;
 };

 struct common_chat_params {
    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    std::string                         prompt;
    std::string                         grammar;
-    bool                                grammar_lazy         = false;
+    bool                                grammar_lazy = false;
    bool                                thinking_forced_open = false;
-    bool                                supports_thinking    = false;
-    std::string                         thinking_start_tag;  // e.g., "<think>"
-    std::string                         thinking_end_tag;    // e.g., "</think>"
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
@ -224,14 +174,13 @@ struct common_chat_params {
 // per-message parsing syntax
 // should be derived from common_chat_params
 struct common_chat_parser_params {
-    common_chat_format      format               = COMMON_CHAT_FORMAT_CONTENT_ONLY;
-    common_reasoning_format reasoning_format     = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
+    common_chat_format       format                = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
-    bool                    reasoning_in_content = false;
-    bool                    thinking_forced_open = false;
-    bool                    parse_tool_calls     = true;
-    bool                    debug                = false;  // Enable debug output for PEG parser
-    common_peg_arena        parser               = {};
+    bool                     reasoning_in_content  = false;
+    bool                     thinking_forced_open  = false;
+    bool                     parse_tool_calls      = true;
+    common_peg_arena         parser                = {};
    common_chat_parser_params() = default;
    common_chat_parser_params(const common_chat_params & chat_params) {
        format               = chat_params.format;
@ -244,42 +193,45 @@ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);

 void common_chat_templates_free(struct common_chat_templates * tmpls);

-struct common_chat_templates_deleter {
-    void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); }
-};
+struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };

 typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;

-common_chat_templates_ptr common_chat_templates_init(const struct llama_model * model,
-                                                     const std::string &        chat_template_override,
-                                                     const std::string &        bos_token_override = "",
-                                                     const std::string &        eos_token_override = "");
+common_chat_templates_ptr common_chat_templates_init(
+                                    const struct llama_model * model,
+                                           const std::string & chat_template_override,
+                                           const std::string & bos_token_override = "",
+                                           const std::string & eos_token_override = "");

 bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
 std::string  common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");

-struct common_chat_params common_chat_templates_apply(const struct common_chat_templates *        tmpls,
-                                                      const struct common_chat_templates_inputs & inputs);
+
+struct common_chat_params      common_chat_templates_apply(
+    const struct common_chat_templates * tmpls,
+    const struct common_chat_templates_inputs & inputs);

 // Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(const struct common_chat_templates * tmpls,
-                                      const std::vector<common_chat_msg> & past_msg,
-                                      const common_chat_msg &              new_msg,
-                                      bool                                 add_ass,
-                                      bool                                 use_jinja);
+std::string common_chat_format_single(
+        const struct common_chat_templates * tmpls,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
+        bool add_ass,
+        bool use_jinja);

 // Returns an example of formatted chat
-std::string common_chat_format_example(const struct common_chat_templates *       tmpls,
-                                       bool                                       use_jinja,
-                                       const std::map<std::string, std::string> & chat_template_kwargs);
+std::string common_chat_format_example(
+    const struct common_chat_templates * tmpls,
+    bool use_jinja,
+    const std::map<std::string, std::string> & chat_template_kwargs);

-const char *            common_chat_format_name(common_chat_format format);
-common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
-common_chat_msg           common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
+const char*               common_chat_format_name(common_chat_format format);
+common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
+common_chat_msg           common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);

 // used by arg and server
-const char *            common_reasoning_format_name(common_reasoning_format format);
-common_reasoning_format common_reasoning_format_from_name(const std::string & format);
+const char *             common_reasoning_format_name(common_reasoning_format format);
+common_reasoning_format  common_reasoning_format_from_name(const std::string & format);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

@ -298,10 +250,3 @@ nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_

 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
-
-std::string common_chat_template_direct_apply(
-    const common_chat_template & tmpl,
-    const autoparser::templates_params & inputs,
-    const std::optional<json> & messages_override = std::nullopt,
-    const std::optional<json> & tools_override = std::nullopt,
-    const std::optional<json> & additional_context = std::nullopt);
--- a/common/common.cpp
+++ b/common/common.cpp
@ -676,7 +676,7 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {

    size_t offset = 0;
    while (offset < filename.size()) {
-        utf8_parse_result result = common_parse_utf8_codepoint(filename, offset);
+        utf8_parse_result result = parse_utf8_codepoint(filename, offset);

        if (result.status != utf8_parse_result::SUCCESS) {
            return false;
--- a/common/common.h
+++ b/common/common.h
@ -104,8 +104,6 @@ enum llama_example {
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,
-    LLAMA_EXAMPLE_RESULTS,
-    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,

    LLAMA_EXAMPLE_COUNT,
 };
@ -236,14 +234,6 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

-    // reasoning budget sampler parameters
-    // these are populated by the server/CLI based on chat template params
-    int32_t                  reasoning_budget_tokens   = -1;   // -1 = disabled, >= 0 = token budget
-    bool                     reasoning_budget_activate_immediately = false;
-    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
-    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
-    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
-
    bool backend_sampling = false;

    bool has_logit_bias() const {
@ -466,8 +456,6 @@ struct common_params {

    bool   kl_divergence    = false; // compute KL divergence

-    bool check             = false; // check rather than generate results for llama-results
-
    bool usage             = false; // print usage
    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
@ -528,15 +516,14 @@ struct common_params {
    std::string cls_sep    = "\t";  // separator of classification sequences

    // server params
-    int32_t port                = 8080;          // server listens on this network port
-    int32_t timeout_read        = 600;           // http read timeout in seconds
-    int32_t timeout_write       = timeout_read;  // http write timeout in seconds
-    int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
-    int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
-    bool    cache_prompt        = true;  // whether to enable prompt caching
-    int32_t n_ctx_checkpoints   = 32;     // max number of context checkpoints per slot
-    int32_t checkpoint_every_nt = 8192;   // make a checkpoint every n tokens during prefill
-    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
+    int32_t port              = 8080;         // server listens on this network port
+    int32_t timeout_read      = 600;          // http read timeout in seconds
+    int32_t timeout_write     = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
+    bool    cache_prompt      = true;         // whether to enable prompt caching
+    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
+    int32_t cache_ram_mib     = 8192;         // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
@ -545,9 +532,7 @@ struct common_params {
    bool use_jinja = true;                                                                                  // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
-    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
    int reasoning_budget = -1;
-    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time

@ -560,7 +545,6 @@ struct common_params {

    // webui configs
    bool webui = true;
-    bool webui_mcp_proxy = false;
    std::string webui_config_json;

    // "advanced" endpoints are disabled by default for better security
@ -885,7 +869,7 @@ std::string common_detokenize(
 // Embedding utils
 //

-// TODO: replace embd_norm with an enum
+// TODO: replace embd_norm with an enum
 void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@ -927,7 +911,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //

-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

 inline std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
--- a/common/console.cpp
+++ b/common/console.cpp
@ -80,8 +80,6 @@ namespace console {
    static termios      initial_state;
 #endif

-    static completion_callback completion_cb = nullptr;
-
    //
    // Init and cleanup
    //
@ -495,7 +493,7 @@ namespace console {
    }

    static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
-                                  size_t & byte_pos, int cursor_byte_pos = -1) {
+                                  size_t & byte_pos) {
        move_to_line_start(char_pos, byte_pos, widths);
        clear_current_line(widths);

@ -505,7 +503,6 @@ namespace console {
        char_pos = 0;

        size_t idx = 0;
-        int back_width = 0;
        while (idx < line.size()) {
            size_t advance = 0;
            char32_t cp = decode_utf8(line, idx, advance);
@ -514,15 +511,8 @@ namespace console {
            if (real_width < 0) real_width = 0;
            widths.push_back(real_width);
            idx += advance;
-            if (cursor_byte_pos >= 0 && static_cast<size_t>(cursor_byte_pos) < idx) {
-                back_width += real_width;
-            } else {
-                ++char_pos;
-                byte_pos = idx;
-            }
-        }
-        if (cursor_byte_pos >= 0) {
-            move_cursor(-back_width);
+            ++char_pos;
+            byte_pos = idx;
        }
    }

@ -794,20 +784,6 @@ namespace console {
                break;
            }

-            if (completion_cb && input_char == '\t') {
-                auto candidates = completion_cb(line, byte_pos);
-
-                if (!candidates.empty()) {
-                    if (candidates.size() > 1 || candidates[0].first != line) {
-                        // TODO?: Display all candidates
-                        set_line_contents(candidates[0].first, line, widths, char_pos, byte_pos, candidates[0].second);
-                    } else {
-                        // TODO: Move cursor to new byte_pos
-                    }
-                    continue;
-                }
-            }
-
            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
                end_of_stream = true;
                break;
@ -1086,10 +1062,6 @@ namespace console {
        return readline_advanced(line, multiline_input);
    }

-    void set_completion_callback(completion_callback cb) {
-        completion_cb = cb;
-    }
-
    namespace spinner {
        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
        static std::condition_variable cv_stop;
--- a/common/console.h
+++ b/common/console.h
@ -4,9 +4,7 @@

 #include "common.h"

-#include <functional>
 #include <string>
-#include <vector>

 enum display_type {
    DISPLAY_TYPE_RESET = 0,
@ -23,9 +21,6 @@ namespace console {
    void set_display(display_type display);
    bool readline(std::string & line, bool multiline_input);

-    using completion_callback = std::function<std::vector<std::pair<std::string, size_t>>(std::string_view, size_t)>;
-    void set_completion_callback(completion_callback cb);
-
    namespace spinner {
        void start();
        void stop();
--- a/common/debug.h
+++ b/common/debug.h
@ -18,7 +18,7 @@ template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml
 // prints tensors that are processed in the computation graph
 // by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
 // non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
-// The template parameter determines whether an error should be thrown whenever a NaN is encountered
+// The template parameter determines whether an error should be thrown whenever a NaN is encountered
 // in a tensor (useful for stopping debug sessions on first erroneous tensor)
 // The callback data will be passed as the third parameter (user_data)
 template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
--- a/common/http.h
+++ b/common/http.h
@ -7,7 +7,6 @@ struct common_http_url {
    std::string user;
    std::string password;
    std::string host;
-    int port;
    std::string path;
 };

@ -48,20 +47,6 @@ static common_http_url common_http_parse_url(const std::string & url) {
        parts.host = rest;
        parts.path = "/";
    }
-
-    auto colon_pos = parts.host.find(':');
-
-    if (colon_pos != std::string::npos) {
-        parts.port = std::stoi(parts.host.substr(colon_pos + 1));
-        parts.host = parts.host.substr(0, colon_pos);
-    } else if (parts.scheme == "http") {
-        parts.port = 80;
-    } else if (parts.scheme == "https") {
-        parts.port = 443;
-    } else {
-        throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
-    }
-
    return parts;
 }

@ -83,7 +68,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
    }
 #endif

-    httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
+    httplib::Client cli(parts.scheme + "://" + parts.host);

    if (!parts.user.empty()) {
        cli.set_basic_auth(parts.user, parts.password);
--- a/common/jinja/README.md
+++ b/common/jinja/README.md
@ -63,7 +63,7 @@ The llama.cpp Jinja engine introduces `jinja::string` (see `jinja/string.h`), wh
  - **One-to-many** (e.g., split): result is marked `is_input` **only if ALL** input parts are marked `is_input`
  - **Many-to-one** (e.g., join): same as one-to-many

-For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag.
+For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag.

 **Enabling Input Marking:**

--- a/common/jinja/caps.cpp
+++ b/common/jinja/caps.cpp
@ -1,4 +1,3 @@
-#include "log.h"
 #include "value.h"
 #include "runtime.h"
 #include "caps.h"
@ -37,16 +36,12 @@ static void caps_try_execute(jinja::program & prog,
    auto tools = ctx.get_val("tools");

    bool success = false;
-    std::string result;
    try {
        jinja::runtime runtime(ctx);
-        auto results = runtime.execute(prog);
-        auto parts = jinja::runtime::gather_string_parts(results);
-        result = parts->as_string().str();
+        runtime.execute(prog);
        success = true;
    } catch (const std::exception & e) {
        JJ_DEBUG("Exception during execution: %s", e.what());
-        result = "";
        // ignore exceptions during capability analysis
    }

@ -95,8 +90,6 @@ caps caps_get(jinja::program & prog) {
        return v->stats.ops.find(op_name) != v->stats.ops.end();
    };

-    JJ_DEBUG("%s\n", ">>> Running capability check: typed content");
-
    // case: typed content support
    caps_try_execute(
        prog,
@ -127,7 +120,6 @@ caps caps_get(jinja::program & prog) {
        }
    );

-    JJ_DEBUG("%s\n", ">>> Running capability check: system prompt");

    // case: system prompt support
    caps_try_execute(
@ -158,9 +150,7 @@ caps caps_get(jinja::program & prog) {
        }
    );

-    JJ_DEBUG("%s\n", ">>> Running capability check: single tool support");
-
-    // case: tools support: single call
+    // case: tools support
    caps_try_execute(
        prog,
        [&]() {
@ -172,10 +162,10 @@ caps caps_get(jinja::program & prog) {
                },
                {
                    {"role", "assistant"},
-                    {"content", ""}, // Some templates expect content to be empty with tool calls
+                    {"content", "Assistant message"},
                    {"tool_calls", json::array({
                        {
-                            {"id", "call00001"},
+                            {"id", "call1"},
                            {"type", "function"},
                            {"function", {
                                {"name", "tool1"},
@ -183,18 +173,19 @@ caps caps_get(jinja::program & prog) {
                                    {"arg", "value"}
                                }}
                            }}
+                        },
+                        {
+                            {"id", "call2"},
+                            {"type", "function"},
+                            {"function", {
+                                {"name", "tool2"},
+                                {"arguments", {
+                                    {"arg", "value"}
+                                }}
+                            }}
                        }
                    })}
                },
-                {
-                    {"role", "tool"},
-                    {"content", "Tool response"},
-                    {"tool_call_id", "call00001"}
-                },
-                {
-                    {"role", "assistant"},
-                    {"content", "The tool response was 'tool response'"}
-                },
                {
                    {"role", "user"},
                    {"content", "User message"},
@ -208,7 +199,7 @@ caps caps_get(jinja::program & prog) {
                    {"name", "tool"},
                    {"type", "function"},
                    {"function", {
-                        {"name", "tool1"},
+                        {"name", "tool"},
                        {"description", "Tool description"},
                        {"parameters", {
                            {"type", "object"},
@ -233,7 +224,6 @@ caps caps_get(jinja::program & prog) {

            auto & tool_name = tools->at(0)->at("function")->at("name");
            caps_print_stats(tool_name, "tools[0].function.name");
-            caps_print_stats(tools, "tools");
            if (!tool_name->stats.used) {
                result.supports_tools = false;
            }
@ -243,93 +233,6 @@ caps caps_get(jinja::program & prog) {
            if (!tool_calls->stats.used) {
                result.supports_tool_calls = false;
            }
-        }
-    );
-
-    JJ_DEBUG("%s\n", ">>> Running capability check: parallel tool support");
-
-    // case: tools support: parallel calls
-    caps_try_execute(
-        prog,
-        [&]() {
-            // messages
-            return json::array({
-                {
-                    {"role", "user"},
-                    {"content", "User message"},
-                },
-                {
-                    {"role", "assistant"},
-                    {"content", ""}, // Some templates expect content to be empty with tool calls
-                    {"tool_calls", json::array({
-                        {
-                            {"id", "call00001"},
-                            {"type", "function"},
-                            {"function", {
-                                {"name", "tool1"},
-                                {"arguments", {
-                                    {"arg", "value"}
-                                }}
-                            }}
-                        },
-                        {
-                            {"id", "call00002"},
-                            {"type", "function"},
-                            {"function", {
-                                {"name", "tool1"},
-                                {"arguments", {
-                                    {"arg", "value"}
-                                }}
-                            }}
-                        }
-                    })}
-                },
-                {
-                    {"role", "tool"},
-                    {"content", "Tool response"},
-                    {"tool_call_id", "call00001"}
-                },
-                {
-                    {"role", "assistant"},
-                    {"content", "The tool response was 'tool response'"}
-                },
-                {
-                    {"role", "user"},
-                    {"content", "User message"},
-                },
-            });
-        },
-        [&]() {
-            // tools
-            return json::array({
-                {
-                    {"name", "tool"},
-                    {"type", "function"},
-                    {"function", {
-                        {"name", "tool1"},
-                        {"description", "Tool description"},
-                        {"parameters", {
-                            {"type", "object"},
-                            {"properties", {
-                                {"arg", {
-                                    {"type", "string"},
-                                    {"description", "Arg description"},
-                                }},
-                            }},
-                            {"required", json::array({ "arg" })},
-                        }},
-                    }},
-                },
-            });
-        },
-        [&](bool success, value & messages, value & /*tools*/) {
-            if (!success) {
-                result.supports_parallel_tool_calls = false;
-                return;
-            }
-
-            auto & tool_calls = messages->at(1)->at("tool_calls");;
-            caps_print_stats(tool_calls, "messages[1].tool_calls");

            // check for second tool call usage
            auto & tool_call_1 = tool_calls->at(1)->at("function");
@ -340,8 +243,6 @@ caps caps_get(jinja::program & prog) {
        }
    );

-    JJ_DEBUG("%s\n", ">>> Running capability check: preserve reasoning");
-
    // case: preserve reasoning content in chat history
    caps_try_execute(
        prog,
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@ -114,10 +114,8 @@ value binary_expression::execute_impl(context & ctx) {

    // Logical operators
    if (op.value == "and") {
-        JJ_DEBUG("Executing logical test: %s AND %s", left->type().c_str(), right->type().c_str());
        return left_val->as_bool() ? right->execute(ctx) : std::move(left_val);
    } else if (op.value == "or") {
-        JJ_DEBUG("Executing logical test: %s OR %s", left->type().c_str(), right->type().c_str());
        return left_val->as_bool() ? std::move(left_val) : right->execute(ctx);
    }

@ -840,7 +838,7 @@ value call_expression::execute_impl(context & ctx) {
    for (auto & arg_stmt : this->args) {
        auto arg_val = arg_stmt->execute(ctx);
        JJ_DEBUG("  Argument type: %s", arg_val->type().c_str());
-        args.push_back(arg_val);
+        args.push_back(std::move(arg_val));
    }
    // execute callee
    value callee_val = callee->execute(ctx);
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@ -12,8 +12,8 @@
 #include <set>
 #include <sstream>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>

 namespace jinja {

--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -27,11 +27,11 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    if (separator_rule.empty()) {
        if (min_items == 1 && !has_max) {
            return item_rule + "+";
-        }
-        if (min_items == 0 && !has_max) {
+        } else if (min_items == 0 && !has_max) {
            return item_rule + "*";
+        } else {
+            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
        }
-        return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
    }

    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
@ -41,7 +41,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }

-static void build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
    auto has_min = min_value != std::numeric_limits<int64_t>::min();
    auto has_max = max_value != std::numeric_limits<int64_t>::max();

@ -128,14 +128,14 @@ static void build_min_max_int(int64_t min_value, int64_t max_value, std::strings
    if (has_min && has_max) {
        if (min_value < 0 && max_value < 0) {
            out << "\"-\" (";
-            build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
            out << ")";
            return;
        }

        if (min_value < 0) {
            out << "\"-\" (";
-            build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
            out << ") | ";
            min_value = 0;
        }
@ -159,7 +159,7 @@ static void build_min_max_int(int64_t min_value, int64_t max_value, std::strings
    if (has_min) {
        if (min_value < 0) {
            out << "\"-\" (";
-            build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
@ -194,7 +194,7 @@ static void build_min_max_int(int64_t min_value, int64_t max_value, std::strings
            }
            digit_range(c, c);
            out << " (";
-            build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
@ -213,10 +213,10 @@ static void build_min_max_int(int64_t min_value, int64_t max_value, std::strings
                more_digits(0, less_decimals);
                out << " | ";
            }
-            build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
-            build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
@ -232,7 +232,7 @@ struct BuiltinRule {
    std::vector<std::string> deps;
 };

-static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
    {"boolean", {"(\"true\" | \"false\") space", {}}},
    {"decimal-part", {"[0-9]{1,16}", {}}},
    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
@ -247,7 +247,7 @@ static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
    {"null", {"\"null\" space", {}}},
 };

-static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
@ -260,26 +260,22 @@ static bool is_reserved_name(const std::string & name) {
    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
        std::unordered_set<std::string> s;
        s.insert("root");
-        for (const auto & p : PRIMITIVE_RULES) {
-            s.insert(p.first);
-        }
-        for (const auto & p : STRING_FORMAT_RULES) {
-            s.insert(p.first);
-        }
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
        return s;
    }();
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }

-static std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
-static std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
-static std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
-static std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
+std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
+std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
+std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
 };

-static std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-static std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch  &)> & replacement) {
    std::smatch match;
@ -326,19 +322,19 @@ private:
        if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) {
            _rules[esc_name] = rule;
            return esc_name;
+        } else {
+            int i = 0;
+            while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
+                i++;
+            }
+            std::string key = esc_name + std::to_string(i);
+            _rules[key] = rule;
+            return key;
        }
-        int i = 0;
-        while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
-            i++;
-        }
-        std::string key = esc_name + std::to_string(i);
-        _rules[key] = rule;
-        return key;
    }

    std::string _generate_union_rule(const std::string & name, const std::vector<json> & alt_schemas) {
        std::vector<std::string> rules;
-        rules.reserve(alt_schemas.size());
        for (size_t i = 0; i < alt_schemas.size(); i++) {
            rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
        }
@ -402,7 +398,6 @@ private:
                flush_literal();

                std::vector<std::string> results;
-                results.reserve(ret.size());
                for (const auto & item : ret) {
                    results.push_back(to_rule(item));
                }
@ -556,7 +551,7 @@ private:
            TrieNode() : is_end_of_string(false) {}

            void insert(const std::string & string) {
-                auto *node = this;
+                auto node = this;
                for (char c : string) {
                    node = &node->children[c];
                }
@ -681,7 +676,7 @@ private:
                if (ks.empty()) {
                    return res;
                }
-                const std::string& k = ks[0];
+                std::string k = ks[0];
                std::string kv_rule_name = prop_kv_rule_names[k];
                std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
                if (first_is_optional) {
@ -784,13 +779,13 @@ public:
                        std::string pointer = ref.substr(ref.find('#') + 1);
                        std::vector<std::string> tokens = string_split(pointer, "/");
                        for (size_t i = 1; i < tokens.size(); ++i) {
-                            const std::string& sel = tokens[i];
+                            std::string sel = tokens[i];
                            if (target.is_object() && target.contains(sel)) {
                                target = target[sel];
                            } else if (target.is_array()) {
                                size_t sel_index;
                                try {
-                                    sel_index = std::stoull(sel);
+                                    sel_index = std::stoul(sel);
                                } catch (const std::invalid_argument & e) {
                                    sel_index = target.size();
                                }
@ -807,7 +802,7 @@ public:
                        _refs[ref] = target;
                    }
                } else {
-                    for (const auto & kv : n.items()) {
+                    for (auto & kv : n.items()) {
                        visit_refs(kv.value());
                    }
                }
@ -817,7 +812,7 @@ public:
        visit_refs(schema);
    }

-    static std::string _generate_constant_rule(const json & value) {
+    std::string _generate_constant_rule(const json & value) {
        return format_literal(value.dump());
    }

@ -828,12 +823,10 @@ public:

        if (schema.contains("$ref")) {
            return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
-        }
-        if (schema.contains("oneOf") || schema.contains("anyOf")) {
+        } else if (schema.contains("oneOf") || schema.contains("anyOf")) {
            std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
            return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
-        }
-        if (schema_type.is_array()) {
+        } else if (schema_type.is_array()) {
            std::vector<json> schema_types;
            for (const auto & t : schema_type) {
                json schema_copy(schema);
@ -841,18 +834,15 @@ public:
                schema_types.push_back(schema_copy);
            }
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
-        }
-        if (schema.contains("const")) {
+        } else if (schema.contains("const")) {
            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
-        }
-        if (schema.contains("enum")) {
+        } else if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
-        }
-        if ((schema_type.is_null() || schema_type == "object")
+        } else if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
            std::unordered_set<std::string> required;
@ -873,12 +863,11 @@ public:
                _build_object_rule(
                    properties, required, name,
                    schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        }
-        if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
            std::unordered_set<std::string> required;
            std::vector<std::pair<std::string, json>> properties;
            std::map<std::string, size_t> enum_values;
-            const std::string& hybrid_name = name;
+            std::string hybrid_name = name;
            std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                if (comp_schema.contains("$ref")) {
                    add_component(_refs[comp_schema["$ref"]], is_required);
@ -901,9 +890,9 @@ public:
                  // todo warning
                }
            };
-            for (const auto & t : schema["allOf"]) {
+            for (auto & t : schema["allOf"]) {
                if (t.contains("anyOf")) {
-                    for (const auto & tt : t["anyOf"]) {
+                    for (auto & tt : t["anyOf"]) {
                        add_component(tt, false);
                    }
                } else {
@ -922,8 +911,7 @@ public:
                }
            }
            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
-        }
-        if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
+        } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
            json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
            if (items.is_array()) {
                std::string rule = "\"[\" space ";
@ -935,31 +923,27 @@ public:
                }
                rule += " \"]\" space";
                return _add_rule(rule_name, rule);
-            }
-            std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
-            int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
-            json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
-            int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+            } else {
+                std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
+                int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
+                json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
+                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();

-            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
-        }
-        if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
+                return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
+            }
+        } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
-        }
-        if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
+        } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
            return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
-        }
-        if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+        } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
            auto prim_name = schema_format + "-string";
            return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
-        }
-        if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+        } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
-        }
-        if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
+        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
            int64_t min_value = std::numeric_limits<int64_t>::min();
            int64_t max_value = std::numeric_limits<int64_t>::max();
            if (schema.contains("minimum")) {
@ -974,24 +958,19 @@ public:
            }
            std::stringstream out;
            out << "(";
-            build_min_max_int(min_value, max_value, out);
+            _build_min_max_int(min_value, max_value, out);
            out << ") space";
            return _add_rule(rule_name, out.str());
-        }
-        if (schema.empty() || schema_type == "object") {
+        } else if (schema.empty() || schema_type == "object") {
            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
+        } else {
+            if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
+                _errors.push_back("Unrecognized schema: " + schema.dump());
+                return "";
+            }
+            // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+            return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
        }
-        if (schema_type.is_null() && schema.is_object()) {
-            // No type constraint and no recognized structural keywords (e.g. {"description": "..."}).
-            // Per JSON Schema semantics this is equivalent to {} and accepts any value.
-            return _add_rule(rule_name, _add_primitive("value", PRIMITIVE_RULES.at("value")));
-        }
-        if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
-            _errors.push_back("Unrecognized schema: " + schema.dump());
-            return "";
-        }
-        // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-        return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
    }

    void check_errors() {
@ -1006,7 +985,7 @@ public:
    std::string format_grammar() {
        std::stringstream ss;
        for (const auto & kv : _rules) {
-            ss << kv.first << " ::= " << kv.second << '\n';
+            ss << kv.first << " ::= " << kv.second << std::endl;
        }
        return ss.str();
    }
--- a/common/peg-parser.cpp
+++ b/common/peg-parser.cpp
@ -1,15 +1,14 @@
-#include "peg-parser.h"
-
 #include "common.h"
+#include "peg-parser.h"
 #include "json-schema-to-grammar.h"
-#include "log.h"
 #include "unicode.h"

+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <initializer_list>
 #include <map>
 #include <memory>
-#include <nlohmann/json.hpp>
 #include <regex>
 #include <stdexcept>
 #include <unordered_set>
@ -35,7 +34,8 @@ static bool is_hex_digit(const char c) {
 // This is used in common_peg_until_parser and to build a GBNF exclusion grammar
 struct trie {
    struct node {
-        std::map<uint32_t, size_t> children;  // Use uint32_t to store Unicode codepoints
+        size_t depth = 0;
+        std::map<unsigned char, size_t> children;
        bool is_word;
    };

@ -55,22 +55,15 @@ struct trie {
        size_t current = 0; // Start at root
        size_t pos = start_pos;

-        // LOG_DBG("%s: checking at pos %zu, sv='%s'\n", __func__, start_pos, std::string(sv).c_str());
-
        while (pos < sv.size()) {
-            auto result = common_parse_utf8_codepoint(sv, pos);
-            if (result.status != utf8_parse_result::SUCCESS) {
-                break;
-            }
-
-            auto it = nodes[current].children.find(result.codepoint);
+            auto it = nodes[current].children.find(sv[pos]);
            if (it == nodes[current].children.end()) {
                // Can't continue matching
                return match_result{match_result::NO_MATCH};
            }

            current = it->second;
-            pos += result.bytes_consumed;
+            pos++;

            // Check if we've matched a complete word
            if (nodes[current].is_word) {
@ -89,22 +82,22 @@ struct trie {
    }

    struct prefix_and_next {
-        std::vector<uint32_t> prefix;
-        std::vector<uint32_t> next_chars;
+        std::string prefix;
+        std::string next_chars;
    };

    std::vector<prefix_and_next> collect_prefix_and_next() {
-        std::vector<uint32_t>        prefix;
+        std::string prefix;
        std::vector<prefix_and_next> result;
        collect_prefix_and_next(0, prefix, result);
        return result;
    }

  private:
-    void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {
+    void collect_prefix_and_next(size_t index, std::string & prefix, std::vector<prefix_and_next> & out) {
        if (!nodes[index].is_word) {
            if (!nodes[index].children.empty()) {
-                std::vector<uint32_t> chars;
+                std::string chars;
                chars.reserve(nodes[index].children.size());
                for (const auto & p : nodes[index].children) {
                    chars.push_back(p.first);
@ -114,7 +107,7 @@ struct trie {
        }

        for (const auto & p : nodes[index].children) {
-            uint32_t ch = p.first;
+            unsigned char ch = p.first;
            auto child = p.second;
            prefix.push_back(ch);
            collect_prefix_and_next(child, prefix, out);
@ -130,19 +123,11 @@ struct trie {

    void insert(const std::string & word) {
        size_t current = 0;
-        size_t pos     = 0;
-        while (pos < word.length()) {
-            auto result = common_parse_utf8_codepoint(word, pos);
-            if (result.status != utf8_parse_result::SUCCESS) {
-                break;
-            }
-
-            uint32_t ch = result.codepoint;
-            pos += result.bytes_consumed;
-
+        for (unsigned char ch : word) {
            auto it = nodes[current].children.find(ch);
            if (it == nodes[current].children.end()) {
                size_t child = create_node();
+                nodes[child].depth = nodes[current].depth + 1;
                nodes[current].children[ch] = child;
                current = child;
            } else {
@ -301,32 +286,6 @@ struct parser_executor {
    parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start)
        : arena(arena), ctx(ctx), start_pos(start) {}

-    std::string debug_indent() const { return std::string(ctx.parse_depth * 2, ' '); }
-
-    std::string debug_input_snippet(size_t pos, size_t len = 60) const {
-        if (pos >= ctx.input.size()) {
-            return "<EOF>";
-        }
-        auto        snippet = ctx.input.substr(pos, len);
-        // Escape newlines for display
-        std::string result;
-        for (char c : snippet) {
-            if (c == '\n') {
-                result += "\\n";
-            } else if (c == '\r') {
-                result += "\\r";
-            } else if (c == '\t') {
-                result += "\\t";
-            } else {
-                result += c;
-            }
-        }
-        if (pos + len < ctx.input.size()) {
-            result += "...";
-        }
-        return result;
-    }
-
    common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const {
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
    }
@ -349,7 +308,7 @@ struct parser_executor {
        auto pos = start_pos;
        for (auto i = 0u; i < p.literal.size(); ++i) {
            if (pos >= ctx.input.size()) {
-                if (!ctx.is_lenient()) {
+                if (!ctx.is_partial) {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -364,32 +323,12 @@ struct parser_executor {
    }

    common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
-        if (ctx.is_debug()) {
-            LOG_DBG("%sSEQ start at %zu '%s' (%zu children)\n", debug_indent().c_str(), start_pos,
-                    debug_input_snippet(start_pos).c_str(), p.children.size());
-        }
-        ctx.parse_depth++;
-
        auto pos = start_pos;
        std::vector<common_peg_ast_id> nodes;

-        for (size_t i = 0; i < p.children.size(); i++) {
-            const auto & child_id = p.children[i];
-            if (ctx.is_debug()) {
-                fprintf(stderr, "%sSEQ child %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
-            }
+        for (const auto & child_id : p.children) {
            auto result = arena.parse(child_id, ctx, pos);
-
-            if (ctx.is_debug()) {
-                fprintf(stderr, "%sSEQ child %zu: %s at %zu->%zu\n", debug_indent().c_str(), i,
-                        common_peg_parse_result_type_name(result.type), result.start, result.end);
-            }
-
            if (result.fail()) {
-                ctx.parse_depth--;
-                if (ctx.is_debug()) {
-                    fprintf(stderr, "%sSEQ -> FAIL\n", debug_indent().c_str());
-                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
            }

@ -398,65 +337,28 @@ struct parser_executor {
            }

            if (result.need_more_input()) {
-                ctx.parse_depth--;
-                if (ctx.is_debug()) {
-                    fprintf(stderr, "%sSEQ -> NEED_MORE\n", debug_indent().c_str());
-                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
            }

            pos = result.end;
        }

-        ctx.parse_depth--;
-        if (ctx.is_debug()) {
-            fprintf(stderr, "%sSEQ -> SUCCESS at %zu->%zu\n", debug_indent().c_str(), start_pos, pos);
-        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
    }

    common_peg_parse_result operator()(const common_peg_choice_parser & p) {
-        if (ctx.is_debug()) {
-            fprintf(stderr, "%sCHOICE start at %zu '%s' (%zu options)\n", debug_indent().c_str(), start_pos,
-                    debug_input_snippet(start_pos).c_str(), p.children.size());
-        }
-        ctx.parse_depth++;
-
        auto pos = start_pos;
-        for (size_t i = 0; i < p.children.size(); i++) {
-            const auto & child_id = p.children[i];
-            if (ctx.is_debug()) {
-                fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
-            }
+        for (const auto & child_id : p.children) {
            auto result = arena.parse(child_id, ctx, pos);
-            if (ctx.is_debug()) {
-                fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i,
-                        common_peg_parse_result_type_name(result.type));
-            }
            if (!result.fail()) {
-                ctx.parse_depth--;
-                if (ctx.is_debug()) {
-                    fprintf(stderr, "%sCHOICE -> %s (option %zu)\n", debug_indent().c_str(),
-                            common_peg_parse_result_type_name(result.type), i);
-                }
                return result;
            }
        }

-        ctx.parse_depth--;
-        if (ctx.is_debug()) {
-            fprintf(stderr, "%sCHOICE -> FAIL (no options matched)\n", debug_indent().c_str());
-        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
    }

    common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
-        if (ctx.is_debug()) {
-            fprintf(stderr, "%sREPEAT start at %zu '%s' (min=%d, max=%d)\n", debug_indent().c_str(), start_pos,
-                    debug_input_snippet(start_pos).c_str(), p.min_count, p.max_count);
-        }
-        ctx.parse_depth++;
-
        auto pos = start_pos;
        int match_count = 0;
        std::vector<common_peg_ast_id> nodes;
@ -464,26 +366,14 @@ struct parser_executor {
        // Try to match up to max_count times (or unlimited if max_count is -1)
        while (p.max_count == -1 || match_count < p.max_count) {
            if (pos >= ctx.input.size()) {
-                if (ctx.is_debug()) {
-                    fprintf(stderr, "%sREPEAT: at end of input, count=%d\n", debug_indent().c_str(), match_count);
-                }
                break;
            }

            auto result = arena.parse(p.child, ctx, pos);

-            if (ctx.is_debug()) {
-                fprintf(stderr, "%sREPEAT iter %d: %s at %zu->%zu, nodes=%zu\n", debug_indent().c_str(), match_count,
-                        common_peg_parse_result_type_name(result.type), result.start, result.end, result.nodes.size());
-                fprintf(stderr, "%sREPEAT CHILD: %s\n", debug_indent().c_str(), arena.dump(p.child).c_str());
-            }
-
            if (result.success()) {
                // Prevent infinite loop on empty matches
                if (result.end == pos) {
-                    if (ctx.is_debug()) {
-                        fprintf(stderr, "%s  REPEAT: empty match, stopping\n", debug_indent().c_str());
-                    }
                    break;
                }

@ -501,43 +391,21 @@ struct parser_executor {
                    nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
                }

-                ctx.parse_depth--;
-                if (ctx.is_debug()) {
-                    fprintf(stderr, "%sREPEAT -> NEED_MORE (count=%d, nodes=%zu)\n", debug_indent().c_str(),
-                            match_count, nodes.size());
-                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
            }

            // Child failed - stop trying
-            if (ctx.is_debug()) {
-                fprintf(stderr, "%sREPEAT: child failed, stopping\n", debug_indent().c_str());
-            }
            break;
        }

        // Check if we got enough matches
        if (p.min_count > 0 && match_count < p.min_count) {
-            ctx.parse_depth--;
-            if (pos >= ctx.input.size() && ctx.is_lenient()) {
-                if (ctx.is_debug()) {
-                    fprintf(stderr, "%sREPEAT -> NEED_MORE (not enough matches: %d < %d)\n", debug_indent().c_str(),
-                            match_count, p.min_count);
-                }
+            if (pos >= ctx.input.size() && ctx.is_partial) {
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
            }
-            if (ctx.is_debug()) {
-                fprintf(stderr, "%sREPEAT -> FAIL (not enough matches: %d < %d)\n", debug_indent().c_str(), match_count,
-                        p.min_count);
-            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
        }

-        ctx.parse_depth--;
-        if (ctx.is_debug()) {
-            fprintf(stderr, "%sREPEAT -> SUCCESS (count=%d, nodes=%zu)\n", debug_indent().c_str(), match_count,
-                    nodes.size());
-        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
    }

@ -566,10 +434,10 @@ struct parser_executor {

    common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const {
        // Parse a single UTF-8 codepoint (not just a single byte)
-        auto result = common_parse_utf8_codepoint(ctx.input, start_pos);
+        auto result = parse_utf8_codepoint(ctx.input, start_pos);

        if (result.status == utf8_parse_result::INCOMPLETE) {
-            if (!ctx.is_lenient()) {
+            if (!ctx.is_partial) {
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
@ -600,7 +468,7 @@ struct parser_executor {

        // Try to match up to max_count times (or unlimited if max_count is -1)
        while (p.max_count == -1 || match_count < p.max_count) {
-            auto result = common_parse_utf8_codepoint(ctx.input, pos);
+            auto result = parse_utf8_codepoint(ctx.input, pos);

            if (result.status == utf8_parse_result::INCOMPLETE) {
                if (match_count >= p.min_count) {
@ -608,7 +476,7 @@ struct parser_executor {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
                }
                // Not enough matches yet
-                if (!ctx.is_lenient()) {
+                if (!ctx.is_partial) {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -649,7 +517,7 @@ struct parser_executor {

        // Check if we got enough matches
        if (match_count < p.min_count) {
-            if (pos >= ctx.input.size() && ctx.is_lenient()) {
+            if (pos >= ctx.input.size() && ctx.is_partial) {
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
@ -658,23 +526,31 @@ struct parser_executor {
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
    }

-    static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos, const char delimiter) {
+    static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
        ++pos; // consume '\'
        if (pos >= ctx.input.size()) {
-            if (!ctx.is_lenient()) {
+            if (!ctx.is_partial) {
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
        }

-        char c = ctx.input[pos];
-        if (c == delimiter || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || c == 't') {
-            ++pos;
-            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
-        } else if (c == 'u') {
-            return handle_unicode_escape(ctx, start, pos);
-        } else {
-            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
+        switch (ctx.input[pos]) {
+            case '"':
+            case '\\':
+            case '/':
+            case 'b':
+            case 'f':
+            case 'n':
+            case 'r':
+            case 't':
+                ++pos;
+                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
+            case 'u':
+                return handle_unicode_escape(ctx, start, pos);
+            default:
+                // Invalid escape sequence
+                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
        }
    }

@ -682,7 +558,7 @@ struct parser_executor {
        ++pos; // consume 'u'
        for (int i = 0; i < 4; ++i) {
            if (pos >= ctx.input.size()) {
-                if (!ctx.is_lenient()) {
+                if (!ctx.is_partial) {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
@ -695,28 +571,28 @@ struct parser_executor {
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
    }

-    common_peg_parse_result operator()(const common_peg_string_parser & p) {
+    common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
        auto pos = start_pos;

        // Parse string content (without quotes)
        while (pos < ctx.input.size()) {
            char c = ctx.input[pos];

-            if (c == p.delimiter) {
-                // Found closing delimiter - success (don't consume it)
+            if (c == '"') {
+                // Found closing quote - success (don't consume it)
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
            }

            if (c == '\\') {
-                auto result = handle_escape_sequence(ctx, start_pos, pos, p.delimiter);
+                auto result = handle_escape_sequence(ctx, start_pos, pos);
                if (!result.success()) {
                    return result;
                }
            } else {
-                auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
+                auto utf8_result = parse_utf8_codepoint(ctx.input, pos);

                if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
-                    if (!ctx.is_lenient()) {
+                    if (!ctx.is_partial) {
                        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                    }
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -731,7 +607,7 @@ struct parser_executor {
        }

        // Reached end without finding closing quote
-        if (!ctx.is_lenient()) {
+        if (!ctx.is_partial) {
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -745,11 +621,11 @@ struct parser_executor {
        size_t last_valid_pos = start_pos;

        while (pos < ctx.input.size()) {
-            auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
+            auto utf8_result = parse_utf8_codepoint(ctx.input, pos);

            if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
                // Incomplete UTF-8 sequence
-                if (!ctx.is_lenient()) {
+                if (!ctx.is_partial) {
                    // Input is complete but UTF-8 is incomplete = malformed
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                }
@ -779,7 +655,7 @@ struct parser_executor {
            last_valid_pos = pos;
        }

-        if (last_valid_pos == ctx.input.size() && ctx.is_lenient()) {
+        if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
            // Reached the end of a partial stream, there might still be more input that we need to consume.
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
        }
@ -818,9 +694,6 @@ struct parser_executor {

    common_peg_parse_result operator()(const common_peg_tag_parser & p) {
        // Parse the child
-        if (ctx.is_debug()) {
-            fprintf(stderr, "%sTAG: %s\n", debug_indent().c_str(), p.tag.c_str());
-        }
        auto result = arena.parse(p.child, ctx, start_pos);

        if (!result.fail()) {
@ -882,31 +755,6 @@ common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) {
    return id;
 }

-static void bfs_node(common_peg_ast_arena &arena, std::ostringstream & oss, const common_peg_ast_node & node, int indent) {
-    for (int i = 0; i < indent; i++) {
-        oss << "  ";
-    }
-    oss << "NODE " << node.id;
-    if (!node.rule.empty()) {
-        oss << " (rule " << node.rule << ")";
-    }
-    if (!node.tag.empty()) {
-        oss << " (tag " << node.tag << ")";
-    }
-    oss << " ['" << node.text << "']\n";
-    for (const auto child : node.children) {
-        bfs_node(arena, oss, arena.get(child), indent + 1);
-    }
-}
-
-std::string common_peg_ast_arena::dump() {
-    std::ostringstream oss;
-    for (auto & node : nodes_) {
-        bfs_node(*this, oss, node, 0);
-    }
-    return oss.str();
-}
-
 void common_peg_arena::resolve_refs() {
    // Walk through all parsers and replace refs with their corresponding rule IDs
    for (auto & parser : parsers_) {
@ -937,7 +785,7 @@ void common_peg_arena::resolve_refs() {
                                 std::is_same_v<T, common_peg_ref_parser> ||
                                 std::is_same_v<T, common_peg_until_parser> ||
                                 std::is_same_v<T, common_peg_literal_parser> ||
-                                 std::is_same_v<T, common_peg_string_parser> ||
+                                 std::is_same_v<T, common_peg_json_string_parser> ||
                                 std::is_same_v<T, common_peg_chars_parser> ||
                                 std::is_same_v<T, common_peg_any_parser> ||
                                 std::is_same_v<T, common_peg_space_parser>) {
@ -955,21 +803,9 @@ void common_peg_arena::resolve_refs() {
 }

 std::string common_peg_arena::dump(common_peg_parser_id id) const {
-    std::unordered_set<common_peg_parser_id> visited;
-    return dump_impl(id, visited);
-}
-
-std::string common_peg_arena::dump_impl(common_peg_parser_id                       id,
-                                        std::unordered_set<common_peg_parser_id> & visited) const {
-    // Check for cycles
-    if (visited.count(id)) {
-        return "[cycle]";
-    }
-    visited.insert(id);
-
    const auto & parser = parsers_.at(id);

-    return std::visit([this, &visited](const auto & p) -> std::string {
+    return std::visit([this](const auto & p) -> std::string {
        using T = std::decay_t<decltype(p)>;

        if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
@ -983,27 +819,24 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
        } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
            std::vector<std::string> parts;
            for (const auto & child : p.children) {
-                parts.push_back(dump_impl(child, visited));
+                parts.push_back(dump(child));
            }
            return "Sequence(" + string_join(parts, ", ") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
            std::vector<std::string> parts;
            for (const auto & child : p.children) {
-                parts.push_back(dump_impl(child, visited));
+                parts.push_back(dump(child));
            }
            return "Choice(" + string_join(parts, ", ") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
            if (p.max_count == -1) {
-                return "Repetition(" + dump_impl(p.child, visited) + ", " + std::to_string(p.min_count) +
-                        ", unbounded)";
+                return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)";
            }
-            return "Repetition(" + dump_impl(p.child, visited) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
+            return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
-            return "And(" + dump_impl(p.child, visited) + ")";
+            return "And(" + dump(p.child) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
-            return "Not(" + dump_impl(p.child, visited) + ")";
-        } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
-            return "Atomic(" + dump_impl(p.child, visited) + ")";
+            return "Not(" + dump(p.child) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
            return "Any";
        } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
@ -1013,20 +846,16 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
                return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
            }
            return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
-        } else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
-            return "String(" + std::string(1, p.delimiter) + ")";
+        } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+            return "JsonString()";
        } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
            return "Until(" + string_join(p.delimiters, " | ") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
-            return "Schema(" + dump_impl(p.child, visited) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
+            return "Schema(" + dump(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
-            return "Rule(" + p.name + ", " + dump_impl(p.child, visited) + ")";
+            return "Rule(" + p.name + ", " + dump(p.child) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
            return "Ref(" + p.name + ")";
-        } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
-            return "Tag(" + p.tag + ", " + dump(p.child) + ")";
-        } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
-            return "Atomic(" + dump(p.child) + ")";
        } else {
            return "Unknown";
        }
@ -1225,32 +1054,7 @@ common_peg_arena common_peg_parser_builder::build() {
    return std::move(arena_);
 }

-// String primitives
-
-common_peg_parser common_peg_parser_builder::string_content(char delimiter) {
-    return wrap(arena_.add_parser(common_peg_string_parser{delimiter}));
-}
-
-common_peg_parser common_peg_parser_builder::double_quoted_string() {
-    return rule("double-quoted-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\""), space()});
-    });
-}
-
-common_peg_parser common_peg_parser_builder::single_quoted_string() {
-    return rule("single-quoted-string", [this]() {
-        return sequence({literal("'"), string_content('\''), literal("'"), space()});
-    });
-}
-
-common_peg_parser common_peg_parser_builder::quoted_string() {
-    return rule("quoted-string", [this]() {
-        return choice({double_quoted_string(), single_quoted_string()});
-    });
-}
-
 // JSON parsers
-
 common_peg_parser common_peg_parser_builder::json_number() {
   return rule("json-number", [this]() {
        auto digit1_9 = chars("[1-9]", 1, 1);
@ -1258,17 +1062,13 @@ common_peg_parser common_peg_parser_builder::json_number() {
        auto int_part = choice({literal("0"), sequence({digit1_9, chars("[0-9]", 0, -1)})});
        auto frac = sequence({literal("."), digits});
        auto exp = sequence({choice({literal("e"), literal("E")}), optional(chars("[+-]", 1, 1)), digits});
-        // Negative lookahead: only commit the number when the next character can't extend it.
-        // At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed.
-        // This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming).
-        auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1));
-        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() });
+        return sequence({optional(literal("-")), int_part, optional(frac), optional(exp), space()});
    });
 }

 common_peg_parser common_peg_parser_builder::json_string() {
    return rule("json-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\""), space()});
+        return sequence({literal("\""), json_string_content(), literal("\""), space()});
    });
 }

@ -1330,81 +1130,8 @@ common_peg_parser common_peg_parser_builder::json() {
    });
 }

-common_peg_parser common_peg_parser_builder::python_string() {
-    return rule("python-string", [this]() {
-        return choice({double_quoted_string(), single_quoted_string()});
-    });
-}
-
-common_peg_parser common_peg_parser_builder::python_number() {
-    return json_number();
-}
-
-common_peg_parser common_peg_parser_builder::python_bool() {
-    return rule("python-bool", [this]() {
-        return sequence({
-            choice({literal("True"), literal("False")}),
-            space()
-        });
-    });
-}
-
-common_peg_parser common_peg_parser_builder::python_null() {
-    return rule("python-none", [this]() {
-        return sequence({literal("None"), space()});
-    });
-}
-
-common_peg_parser common_peg_parser_builder::python_dict() {
-    return rule("python-dict", [this]() {
-        auto ws = space();
-        auto member = sequence({python_string(), ws, literal(":"), ws, python_value()});
-        auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
-        return sequence({
-            literal("{"),
-            ws,
-            choice({
-                literal("}"),
-                sequence({members, ws, literal("}")})
-            }),
-            ws
-        });
-    });
-}
-
-common_peg_parser common_peg_parser_builder::python_array() {
-    return rule("python-array", [this]() {
-        auto ws = space();
-        auto elements = sequence({python_value(), zero_or_more(sequence({literal(","), ws, python_value()}))});
-        return sequence({
-            literal("["),
-            ws,
-            choice({
-                literal("]"),
-                sequence({elements, ws, literal("]")})
-            }),
-            ws
-        });
-    });
-}
-
-common_peg_parser common_peg_parser_builder::python_value() {
-    return rule("python-value", [this]() {
-        return choice({
-            python_dict(),
-            python_array(),
-            python_string(),
-            python_number(),
-            python_bool(),
-            python_null()
-        });
-    });
-}
-
-common_peg_parser common_peg_parser_builder::marker() {
-    auto sharp_bracket_parser = literal("<") + until(">") + literal(">");
-    auto square_bracket_parser = literal("[") + until("]") + literal("]");
-    return choice({ sharp_bracket_parser, square_bracket_parser });
+common_peg_parser common_peg_parser_builder::json_string_content() {
+    return wrap(arena_.add_parser(common_peg_json_string_parser{}));
 }

 common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) {
@ -1418,54 +1145,17 @@ common_peg_parser common_peg_parser_builder::json_member(const std::string & key
    });
 }

-static std::string gbnf_escape_char_class(uint32_t c) {
-    if (c == '-' || c == ']' || c == '[' || c == '\\') {
-        return "\\" + std::string(1, (char) c);
-    }
-    // Escape whitespace control characters
-    if (c == '\n') {
-        return "\\n";
-    }
-    if (c == '\t') {
-        return "\\t";
-    }
-    if (c == '\r') {
-        return "\\r";
-    }

-    // Printable ASCII
-    if (c >= 0x20 && c <= 0x7E) {
-        return std::string(1, (char) c);
+static std::string gbnf_escape_char_class(char c) {
+    switch (c) {
+        case '\n': return "\\n";
+        case '\t': return "\\t";
+        case '\r': return "\\r";
+        case '\\': return "\\\\";
+        case ']':  return "\\]";
+        case '[':  return "\\[";
+        default:   return std::string(1, c);
    }
-
-    // Hex escape
-    char         buf[16];
-    const char * hex = "0123456789ABCDEF";
-
-    if (c <= 0xFF) {
-        buf[0] = '\\';
-        buf[1] = 'x';
-        buf[2] = hex[(c >> 4) & 0xF];
-        buf[3] = hex[c & 0xF];
-        buf[4] = '\0';
-    } else if (c <= 0xFFFF) {
-        buf[0] = '\\';
-        buf[1] = 'u';
-        buf[2] = hex[(c >> 12) & 0xF];
-        buf[3] = hex[(c >> 8) & 0xF];
-        buf[4] = hex[(c >> 4) & 0xF];
-        buf[5] = hex[c & 0xF];
-        buf[6] = '\0';
-    } else {
-        buf[0] = '\\';
-        buf[1] = 'U';
-        for (int i = 0; i < 8; i++) {
-            buf[2 + i] = hex[(c >> ((7 - i) * 4)) & 0xF];
-        }
-        buf[10] = '\0';
-    }
-
-    return std::string(buf);
 }

 static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
@ -1483,12 +1173,12 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin

        std::string cls;
        cls.reserve(chars.size());
-        for (uint32_t ch : chars) {
+        for (const auto & ch : chars) {
            cls += gbnf_escape_char_class(ch);
        }

        if (!pre.empty()) {
-            pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
+            pattern += gbnf_format_literal(pre) + " [^" + cls + "]";
        } else {
            pattern += "[^" + cls + "]";
        }
@ -1518,7 +1208,7 @@ static std::unordered_set<std::string> collect_reachable_rules(
                          std::is_same_v<T, common_peg_chars_parser> ||
                          std::is_same_v<T, common_peg_space_parser> ||
                          std::is_same_v<T, common_peg_any_parser> ||
-                          std::is_same_v<T, common_peg_string_parser>) {
+                          std::is_same_v<T, common_peg_json_string_parser>) {
                // These parsers do not have any children
            } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
                for (auto child : p.children) {
@ -1654,9 +1344,8 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                    return result + "{" + std::to_string(p.min_count) + "}";
                }
                return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
-            } else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
-                const std::string delim(1, p.delimiter);
-                return R"(( [^)" + delim + R"(\\] | "\\" ( [)" + delim + R"(\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
+            } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+                return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
            } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
                if (p.delimiters.empty()) {
                    return ".*";
@ -1786,8 +1475,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
                {"min_count", p.min_count},
                {"max_count", p.max_count}
            };
-        } else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
-            return json{{"type", "string"}, {"delimiter", std::string(1, p.delimiter)}};
+        } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+            return json{{"type", "json_string"}};
        } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
            return json{{"type", "until"}, {"delimiters", p.delimiters}};
        } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
@ -1914,15 +1603,8 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
        }
        return parser;
    }
-    if (type == "string") {
-        if (!j.contains("delimiter")) {
-            throw std::runtime_error("string parser missing delimiter field.");
-        }
-        std::string delimiter = j["delimiter"];
-        if (delimiter.empty()) {
-            throw std::runtime_error("string parser delimiter is empty.");
-        }
-        return common_peg_string_parser{delimiter[0]};
+    if (type == "json_string") {
+        return common_peg_json_string_parser{};
    }
    if (type == "until") {
        if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
--- a/common/peg-parser.h
+++ b/common/peg-parser.h
@ -4,7 +4,6 @@

 #include <memory>
 #include <unordered_map>
-#include <unordered_set>
 #include <string>
 #include <string_view>
 #include <functional>
@ -112,8 +111,6 @@ class common_peg_ast_arena {

    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
-
-    std::string dump();
 };

 struct common_peg_parse_result {
@ -139,43 +136,21 @@ struct common_peg_parse_result {
    bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
 };

-enum common_peg_parse_flags {
-    COMMON_PEG_PARSE_FLAG_NONE    = 0,
-    COMMON_PEG_PARSE_FLAG_LENIENT = 1 << 0,
-    COMMON_PEG_PARSE_FLAG_DEBUG   = 1 << 1,
-};
-
-inline common_peg_parse_flags operator|(common_peg_parse_flags a, common_peg_parse_flags b) {
-    return static_cast<common_peg_parse_flags>(int(a) | int(b));
-}
-
-inline common_peg_parse_flags & operator|=(common_peg_parse_flags & a, common_peg_parse_flags b) {
-    return a = a | b;
-}
-
-inline common_peg_parse_flags operator&(common_peg_parse_flags a, common_peg_parse_flags b) {
-    return static_cast<common_peg_parse_flags>(int(a) & int(b));
-}
-
-inline common_peg_parse_flags operator~(common_peg_parse_flags a) {
-    return static_cast<common_peg_parse_flags>(~int(a));
-}
-
 struct common_peg_parse_context {
    std::string input;
-    common_peg_parse_flags flags;
+    bool is_partial;
    common_peg_ast_arena ast;

    int parse_depth;

-    common_peg_parse_context(common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
-        : flags(flags), parse_depth(0) {}
+    common_peg_parse_context()
+        : is_partial(false), parse_depth(0) {}

-    common_peg_parse_context(const std::string & input, common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
-        : input(input), flags(flags), parse_depth(0) {}
+    common_peg_parse_context(const std::string & input)
+        : input(input), is_partial(false), parse_depth(0) {}

-    bool is_lenient() const { return flags & COMMON_PEG_PARSE_FLAG_LENIENT; }
-    bool is_debug() const { return flags & COMMON_PEG_PARSE_FLAG_DEBUG; }
+    common_peg_parse_context(const std::string & input, bool is_partial)
+        : input(input), is_partial(is_partial), parse_depth(0) {}
 };

 class common_peg_arena;
@ -231,9 +206,7 @@ struct common_peg_chars_parser {
    int max_count;  // -1 for unbounded
 };

-struct common_peg_string_parser {
-    char delimiter;
-};
+struct common_peg_json_string_parser {};

 struct common_peg_until_parser {
    std::vector<std::string> delimiters;
@ -281,7 +254,7 @@ using common_peg_parser_variant = std::variant<
    common_peg_any_parser,
    common_peg_space_parser,
    common_peg_chars_parser,
-    common_peg_string_parser,
+    common_peg_json_string_parser,
    common_peg_until_parser,
    common_peg_schema_parser,
    common_peg_rule_parser,
@ -326,8 +299,6 @@ class common_peg_arena {
    friend class common_peg_parser_builder;

  private:
-    std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;
-
    common_peg_parser_id add_parser(common_peg_parser_variant parser);
    void add_rule(const std::string & name, common_peg_parser_id id);

@ -433,18 +404,6 @@ class common_peg_parser_builder {
    //   S -> A{n}
    common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }

-    // Matches a double-quoted string: '"' content '"' space
-    common_peg_parser double_quoted_string();
-
-    // Matches a single-quoted string: "'" content "'" space
-    common_peg_parser single_quoted_string();
-
-    // Matches a string that accepts both double-quoted and single-quoted styles.
-    common_peg_parser quoted_string();
-
-    // Matches string content without the surrounding delimiter.
-    common_peg_parser string_content(char delimiter);
-
    // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
    //   value -> object | array | string | number | true | false | null
    common_peg_parser json();
@ -455,24 +414,14 @@ class common_peg_parser_builder {
    common_peg_parser json_bool();
    common_peg_parser json_null();

+    // Matches JSON string content without the surrounding quotes.
+    // Useful for extracting content within a JSON string.
+    common_peg_parser json_string_content();
+
    // Matches a JSON object member with a key and associated parser as the
    // value.
    common_peg_parser json_member(const std::string & key, const common_peg_parser & p);

-    // Creates a complete Python format parser supporting dicts, arrays, strings, numbers, booleans, and None.
-    // Differs from JSON: uses True/False/None, accepts both single and double-quoted strings.
-    //   value -> dict | array | string | number | True | False | None
-    common_peg_parser python_value();
-    common_peg_parser python_dict();
-    common_peg_parser python_string();
-    common_peg_parser python_array();
-    common_peg_parser python_number();
-    common_peg_parser python_bool();
-    common_peg_parser python_null();
-
-    // A marker, i.e. text delimited by a pair of <> or []
-    common_peg_parser marker();
-
    // Wraps a parser with JSON schema metadata for grammar generation.
    // Used internally to convert JSON schemas to GBNF grammar rules.
    common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@ -1,219 +0,0 @@
-#include "reasoning-budget.h"
-#include "common.h"
-#include "unicode.h"
-
-#include "log.h"
-
-#include <cmath>
-#include <cstdint>
-#include <string>
-#include <vector>
-
-struct token_matcher {
-    std::vector<llama_token> tokens;
-    size_t pos = 0;
-
-    bool advance(llama_token token) {
-        if (tokens.empty()) {
-            return false;
-        }
-
-        if (token == tokens[pos]) {
-            pos++;
-            if (pos >= tokens.size()) {
-                pos = 0;
-                return true;
-            }
-        } else {
-            pos = 0;
-            if (token == tokens[0]) {
-                pos = 1;
-            }
-        }
-        return false;
-    }
-
-    void reset() { pos = 0; }
-};
-
-struct common_reasoning_budget_ctx {
-    const llama_vocab * vocab;
-
-    token_matcher start_matcher;
-    token_matcher end_matcher;
-    std::vector<llama_token> forced_tokens;
-
-    int32_t budget;           // maximum tokens in reasoning block
-    int32_t remaining;        // tokens remaining in budget
-
-    common_reasoning_budget_state state;
-
-    // for forcing
-    size_t force_pos;         // next position in forced_tokens to force
-};
-
-static const char * common_reasoning_budget_name(const struct llama_sampler * /*smpl*/) {
-    return "reasoning-budget";
-}
-
-static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_token token) {
-    auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
-
-    switch (ctx->state) {
-        case REASONING_BUDGET_IDLE:
-        {
-            if (ctx->start_matcher.advance(token)) {
-                ctx->state = REASONING_BUDGET_COUNTING;
-                ctx->remaining = ctx->budget;
-                LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);
-
-                if (ctx->remaining <= 0) {
-                    ctx->state = REASONING_BUDGET_FORCING;
-                    ctx->force_pos = 0;
-                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
-                }
-            }
-            break;
-        }
-        case REASONING_BUDGET_COUNTING:
-        case REASONING_BUDGET_WAITING_UTF8:
-        {
-            if (ctx->end_matcher.advance(token)) {
-                ctx->state = REASONING_BUDGET_DONE;
-                LOG_INF("reasoning-budget: deactivated (natural end)\n");
-                break;
-            }
-
-            bool utf8_complete = true;
-            if (ctx->vocab != nullptr) {
-                const std::string piece = common_token_to_piece(ctx->vocab, token, false);
-                utf8_complete = common_utf8_is_complete(piece);
-            }
-
-            if (ctx->state == REASONING_BUDGET_WAITING_UTF8) {
-                if (utf8_complete) {
-                    ctx->state = REASONING_BUDGET_FORCING;
-                    ctx->force_pos = 0;
-                    ctx->end_matcher.reset();
-                    LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
-                }
-            } else if (ctx->state == REASONING_BUDGET_COUNTING) {
-                ctx->remaining--;
-                if (ctx->remaining <= 0) {
-                    if (utf8_complete) {
-                        ctx->state = REASONING_BUDGET_FORCING;
-                        ctx->force_pos = 0;
-                        ctx->end_matcher.reset();
-                        LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
-                    } else {
-                        ctx->state = REASONING_BUDGET_WAITING_UTF8;
-                        ctx->end_matcher.reset();
-                        LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
-                    }
-                }
-            }
-            break;
-        }
-        case REASONING_BUDGET_FORCING:
-            // force_pos is advanced in apply(), not here.
-            // This ensures the first forced token isn't skipped when the sampler
-            // is initialized directly in FORCING state (e.g. COUNTING + budget=0)
-            break;
-        case REASONING_BUDGET_DONE:
-            break;
-    }
-}
-
-static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
-
-    if (ctx->state != REASONING_BUDGET_FORCING) {
-        // passthrough — don't modify logits
-        return;
-    }
-
-    if (ctx->force_pos >= ctx->forced_tokens.size()) {
-        return;
-    }
-
-    const llama_token forced = ctx->forced_tokens[ctx->force_pos];
-
-    // set all logits to -inf except the forced token
-    for (size_t i = 0; i < cur_p->size; i++) {
-        if (cur_p->data[i].id != forced) {
-            cur_p->data[i].logit = -INFINITY;
-        }
-    }
-
-    // advance to next forced token (done here rather than in accept so that
-    // the first forced token isn't skipped when starting in FORCING state)
-    ctx->force_pos++;
-    if (ctx->force_pos >= ctx->forced_tokens.size()) {
-        ctx->state = REASONING_BUDGET_DONE;
-        LOG_INF("reasoning-budget: forced sequence complete, done\n");
-    }
-}
-
-static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
-    auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
-    ctx->state = REASONING_BUDGET_IDLE;
-    ctx->remaining = ctx->budget;
-    ctx->start_matcher.reset();
-    ctx->end_matcher.reset();
-    ctx->force_pos = 0;
-}
-
-static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
-    return common_reasoning_budget_init(
-        ctx->vocab,
-        ctx->start_matcher.tokens,
-        ctx->end_matcher.tokens,
-        ctx->forced_tokens,
-        ctx->budget,
-        ctx->state);
-}
-
-static void common_reasoning_budget_free(struct llama_sampler * smpl) {
-    delete (common_reasoning_budget_ctx *) smpl->ctx;
-}
-
-static struct llama_sampler_i common_reasoning_budget_i = {
-    /* .name              = */ common_reasoning_budget_name,
-    /* .accept            = */ common_reasoning_budget_accept,
-    /* .apply             = */ common_reasoning_budget_apply,
-    /* .reset             = */ common_reasoning_budget_reset,
-    /* .clone             = */ common_reasoning_budget_clone,
-    /* .free              = */ common_reasoning_budget_free,
-    /* .backend_init      = */ nullptr,
-    /* .backend_accept    = */ nullptr,
-    /* .backend_apply     = */ nullptr,
-    /* .backend_set_input = */ nullptr,
-};
-
-struct llama_sampler * common_reasoning_budget_init(
-        const struct llama_vocab       * vocab,
-        const std::vector<llama_token> & start_tokens,
-        const std::vector<llama_token> & end_tokens,
-        const std::vector<llama_token> & forced_tokens,
-        int32_t                          budget,
-        common_reasoning_budget_state    initial_state) {
-    // promote COUNTING with budget <= 0 to FORCING
-    if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
-        initial_state = REASONING_BUDGET_FORCING;
-    }
-
-    return llama_sampler_init(
-        /* .iface = */ &common_reasoning_budget_i,
-        /* .ctx   = */ new common_reasoning_budget_ctx {
-            /* .vocab         = */ vocab,
-            /* .start_matcher = */ { start_tokens, 0 },
-            /* .end_matcher   = */ { end_tokens, 0 },
-            /* .forced_tokens = */ forced_tokens,
-            /* .budget        = */ budget,
-            /* .remaining     = */ budget,
-            /* .state         = */ initial_state,
-            /* .force_pos     = */ 0,
-        }
-    );
-}
--- a/common/reasoning-budget.h
+++ b/common/reasoning-budget.h
@ -1,41 +0,0 @@
-#pragma once
-
-#include "llama.h"
-
-#include <cstdint>
-#include <vector>
-
-enum common_reasoning_budget_state {
-    REASONING_BUDGET_IDLE,         // waiting for start sequence
-    REASONING_BUDGET_COUNTING,     // counting down tokens
-    REASONING_BUDGET_FORCING,      // forcing budget message + end sequence
-    REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion
-    REASONING_BUDGET_DONE,         // passthrough forever
-};
-
-// Creates a reasoning budget sampler that limits token generation inside a
-// reasoning block (e.g. between <think> and </think>).
-//
-// State machine: IDLE -> COUNTING -> WAITING_UTF8 -> FORCING -> DONE
-//   IDLE:         passthrough, watching for start_tokens sequence
-//   COUNTING:     counting down remaining tokens, watching for natural end_tokens
-//   WAITING_UTF8: budget exhausted, allowing tokens to complete a UTF-8 sequence
-//   FORCING:      forces forced_tokens token-by-token (all other logits -> -inf)
-//   DONE:         passthrough forever
-//
-// Parameters:
-//   vocab         - vocabulary (used for UTF-8 boundary detection; can be nullptr)
-//   start_tokens  - token sequence that activates counting
-//   end_tokens    - token sequence for natural deactivation
-//   forced_tokens - token sequence forced when budget expires
-//   budget        - max tokens allowed in the reasoning block
-//   initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
-//                   note: COUNTING with budget <= 0 is promoted to FORCING
-//
-struct llama_sampler * common_reasoning_budget_init(
-        const struct llama_vocab       * vocab,
-        const std::vector<llama_token> & start_tokens,
-        const std::vector<llama_token> & end_tokens,
-        const std::vector<llama_token> & forced_tokens,
-        int32_t                          budget,
-        common_reasoning_budget_state    initial_state);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -2,7 +2,6 @@

 #include "common.h"
 #include "log.h"
-#include "reasoning-budget.h"

 #include <algorithm>
 #include <cmath>
@ -251,17 +250,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        }
    }

-    // reasoning budget sampler — added first so it can force tokens before other samplers
-    if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
-        samplers.push_back(common_reasoning_budget_init(
-            vocab,
-            params.reasoning_budget_start,
-            params.reasoning_budget_end,
-            params.reasoning_budget_forced,
-            params.reasoning_budget_tokens,
-            params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
-    }
-
    if (params.has_logit_bias()) {
        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
    }
--- a/common/unicode.cpp
+++ b/common/unicode.cpp
@ -1,20 +1,14 @@
 #include "unicode.h"

-#include <algorithm>
-#include <cassert>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
 // implementation adopted from src/unicode.cpp

-size_t common_utf8_sequence_length(unsigned char first_byte) {
+size_t utf8_sequence_length(unsigned char first_byte) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
    return lookup[highbits];
 }

-utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset) {
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
    if (offset >= input.size()) {
        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
    }
@ -68,57 +62,3 @@ utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t off
    // Invalid first byte
    return utf8_parse_result(utf8_parse_result::INVALID);
 }
-
-bool common_utf8_is_complete(const std::string & s) {
-    if (s.empty()) {
-        return true;
-    }
-    for (int i = 1; i <= std::min(4, (int)s.size()); i++) {
-        unsigned char c = s[s.size() - i];
-        if ((c & 0xC0) != 0x80) {
-            int expected = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : (c >= 0xC0) ? 2 : 1;
-            return i >= expected;
-        }
-    }
-    return false;
-}
-
-std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
-    std::string result;
-    for (size_t i = 0; i < cps.size(); ++i) {
-        result.append(common_unicode_cpt_to_utf8(cps[i]));
-    }
-    return result;
-}
-
-std::string common_unicode_cpt_to_utf8(uint32_t cpt) {
-    std::string result;
-
-    if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
-        result.push_back(cpt);
-        return result;
-    }
-    if (0x80 <= cpt && cpt <= 0x7ff) {
-        result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
-        result.push_back(0x80 | (cpt & 0x3f));
-        return result;
-    }
-    if (0x800 <= cpt && cpt <= 0xffff) {
-        result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
-        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
-        result.push_back(0x80 | (cpt & 0x3f));
-        return result;
-    }
-    if (0x10000 <= cpt && cpt <= 0x10ffff) {
-        result.push_back(0xf0 | ((cpt >> 18) & 0x07));
-        result.push_back(0x80 | ((cpt >> 12) & 0x3f));
-        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
-        result.push_back(0x80 | (cpt & 0x3f));
-        return result;
-    }
-
-    throw std::invalid_argument("invalid codepoint");
-}
-
-
-
--- a/common/unicode.h
+++ b/common/unicode.h
@ -2,8 +2,6 @@

 #include <cstdint>
 #include <string_view>
-#include <vector>
-#include <string>

 // UTF-8 parsing utilities for streaming-aware unicode support

@ -18,13 +16,7 @@ struct utf8_parse_result {

 // Determine the expected length of a UTF-8 sequence from its first byte
 // Returns 0 for invalid first bytes
-size_t common_utf8_sequence_length(unsigned char first_byte);
-
-// Check if a string ends with a complete UTF-8 sequence.
-bool common_utf8_is_complete(const std::string & s);
+size_t utf8_sequence_length(unsigned char first_byte);

 // Parse a single UTF-8 codepoint from input
-utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset);
-
-std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps);
-std::string common_unicode_cpt_to_utf8(uint32_t cpt);
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -144,7 +144,6 @@ class ModelBase:
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
-        self._is_nvfp4 = False

        # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
        # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@ -272,9 +271,6 @@ class ModelBase:
        return tensors

    def dequant_model(self):
-        if self._is_nvfp4:
-            return  # NVFP4 weights are repacked in _generate_nvfp4_tensors
-
        tensors_to_remove: list[str] = []
        new_tensors: dict[str, Callable[[], Tensor]] = {}

@ -520,13 +516,6 @@ class ModelBase:
        raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors)
-        if self._is_nvfp4:
-            if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")):
-                return []
-            if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
-                return []
-
        new_name = self.map_tensor_name(name)

        # Handle gate/up expert tensor fusion if enabled
@ -562,135 +551,9 @@ class ModelBase:
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        return ()

-    @staticmethod
-    def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
-        """Repack NVFP4 ModelOpt tensors into ggml super-block layout.
-        Preserves original E4M3 scale bits as UE4M3 (strip sign bit).
-        The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul().
-        Returns (raw_data, logical_shape)."""
-
-        out_features = weight.shape[0]
-        n_blocks = scale.shape[1]
-
-        # Unpack ModelOpt nibble-packed weights
-        w = weight.reshape(out_features, n_blocks, 8)
-        vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16)
-
-        # Preserve original E4M3 scale bits as UE4M3 (strip sign bit)
-        d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F
-        qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy()
-
-        # Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements
-        n_super = n_blocks // 4
-        d_grouped = d_ue.reshape(out_features, n_super, 4)
-        qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32)
-        raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
-        return raw, [out_features, n_super * 64]
-
-    @staticmethod
-    def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
-        return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
-
-    def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
-        raw, shape = self._nvfp4_pack(weight, scale)
-        logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
-        self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
-
-        # Emit per-tensor scale2 as a separate F32 tensor when non-trivial
-        if not self._nvfp4_scale2_is_trivial(scale2):
-            scale2_f32 = scale2.float().numpy().flatten()
-            scale_name = new_name.replace(".weight", ".scale")
-            logger.info(f"  + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
-            self.gguf_writer.add_tensor(scale_name, scale2_f32)
-
-    def _generate_nvfp4_tensors(self):
-        # Per-layer expert merging to avoid holding all experts in memory
-        expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
-        expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
-        expert_shapes: dict[tuple[int, str], list[int]] = {}
-        n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
-
-        for name in list(self.model_tensors.keys()):
-            if not name.endswith(".weight"):
-                continue
-            scale_name = name.replace(".weight", ".weight_scale")
-            scale2_name = name.replace(".weight", ".weight_scale_2")
-            if scale_name not in self.model_tensors:
-                continue
-            # Force eager materialization of lazy tensors
-            weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
-            scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
-            scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
-
-            # Check if this is a per-expert tensor
-            m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
-            if m:
-                expert_id = int(m.group(1))
-                proj_type = m.group(2)
-                bid_m = re.search(r'\.layers\.(\d+)\.', name)
-                bid = int(bid_m.group(1)) if bid_m else 0
-                key = (bid, proj_type)
-
-                raw, shape = self._nvfp4_pack(weight, scale)
-
-                if key not in expert_blocks:
-                    expert_blocks[key] = []
-                    expert_scales[key] = []
-                    expert_shapes[key] = shape
-                expert_blocks[key].append((expert_id, raw.copy()))
-                # Collect per-expert scale2 (scalar per expert)
-                expert_scales[key].append((expert_id, float(scale2.float().sum())))
-
-                # Flush when all experts for this (layer, proj) are collected
-                if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
-                    self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
-            else:
-                new_name = self.map_tensor_name(name)
-                self._repack_nvfp4(new_name, weight, scale, scale2)
-
-        # Flush any remaining experts (fallback if n_experts was unknown)
-        for (bid, proj_type) in list(expert_blocks.keys()):
-            self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
-
-    def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
-        experts = expert_blocks.pop(key)
-        scales = expert_scales.pop(key)
-        shape = expert_shapes.pop(key)
-
-        experts.sort(key=lambda x: x[0])
-        merged = np.stack([e[1] for e in experts], axis=0)
-        merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight"
-        new_name = self.map_tensor_name(merged_name)
-        logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
-        self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
-
-        # Emit per-expert scale2 tensor if any expert has non-trivial scale2
-        scales.sort(key=lambda x: x[0])
-        scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
-        if not np.allclose(scale_vals, 1.0, atol=1e-6):
-            scale_name = new_name.replace(".weight", ".scale")
-            logger.info(f"  + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
-            self.gguf_writer.add_tensor(scale_name, scale_vals)
-
-        del experts, merged
-
    def prepare_tensors(self):
-        # detect NVFP4 quantization (ModelOpt format)
-        quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
-        quant_config_file = self.dir_model / "hf_quant_config.json"
-
-        if not quant_algo and quant_config_file.is_file():
-            with open(quant_config_file, "r", encoding="utf-8") as f:
-                quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo")
-
-        self._is_nvfp4 = quant_algo == "NVFP4"
-
        self.dequant_model()

-        # NVFP4 weights are repacked and written directly to gguf_writer
-        if self._is_nvfp4:
-            self._generate_nvfp4_tensors()
-
        # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
        if self.tensor_map.mapping:
            max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@ -4168,7 +4031,7 @@ class Qwen2VLVisionModel(MmprojModel):
                # split Conv3D into Conv2Ds
                c1, c2, kt, kh, kw = data_torch.shape
                del c1, c2, kh, kw  # unused
-                assert kt == 2, "Current implementation only support temporal_patch_size of 2"
+                assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight"  , data_torch[:, :, 0, ...])
                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
            else:
@ -4440,14 +4303,6 @@ class Qwen2MoeModel(TextModel):
        # process the experts separately
        name = name.replace("language_model.", "") # InternVL

-        # NVFP4 expert weights are handled in _generate_nvfp4_tensors
-        if self._is_nvfp4 and "experts" in name:
-            if name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
-                if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
-                    return
-                if not name.endswith(".weight"):
-                    return
-
        # handle aggregated expert tensors
        # GGUF stores dimensions reversed from PyTorch, so:
        # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
@ -4535,31 +4390,15 @@ class Qwen3Model(Qwen2Model):
        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

-        if self._is_qwen3_reranker():
-            self._find_rerank_config()
-
-    def _is_qwen3_reranker(self) -> bool:
+        # a bit hacky, but currently the only way to detect if this is a rerank model
+        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
        readme_path = self.dir_model / "README.md"
        readme_text = ""
        if readme_path.exists():
            with readme_path.open("r", encoding="utf-8") as f:
                readme_text = f.read()
-
-        name_hints = [
-            str(self.dir_model.name),
-            str(self.hparams.get("_name_or_path", "")),
-            str(self.hparams.get("model_type", "")),
-            str(self.origin_hf_arch or ""),
-        ]
-        name_hints = [hint.lower() for hint in name_hints if hint]
-
-        if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower():
-            return True
-
-        if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints):
-            return True
-
-        return "sequenceclassification" in (self.origin_hf_arch or "").lower()
+        if "# Qwen3-Reranker" in readme_text:
+            self._find_rerank_config()

    def set_vocab(self):
        # deal with intern-s1-mini
@ -5003,12 +4842,12 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
+@ModelBase.register("Qwen3_5ForConditionalGeneration")
 class Qwen3_5TextModel(_LinearAttentionVReorderBase):
    model_arch = gguf.MODEL_ARCH.QWEN35


-@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
+@ModelBase.register("Qwen3_5MoeForConditionalGeneration")
 class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
    model_arch = gguf.MODEL_ARCH.QWEN35MOE

@ -5062,7 +4901,7 @@ class Phi2Model(TextModel):
        self.gguf_writer.add_add_bos_token(False)


-@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV")
+@ModelBase.register("Phi3ForCausalLM")
 class Phi3MiniModel(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI3

@ -5237,129 +5076,6 @@ class Phi3MiniModel(TextModel):
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.startswith(("model.vision_tower.", "vision_tower.", "model.mm_projector.", "mm_projector.")):
-            return
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-
-@ModelBase.register("Phi4ForCausalLMV")
-class Phi4VisionMmprojModel(MmprojModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        assert self.hparams_vision is not None
-
-        self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
-        if self.vision_total_layers < 2:
-            raise ValueError(
-                f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
-            )
-
-        # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and
-        # drop post-layernorm/head weights. This makes the GGUF runtime output match
-        # the feature map consumed by the patched siglip.cpp Phi-4 projector path.
-        self.vision_export_layers = self.vision_total_layers - 1
-        self.vision_last_layer_idx = self.vision_total_layers - 1
-
-        for key in self.n_block_keys:
-            if key in self.hparams_vision:
-                self.hparams_vision[key] = self.vision_export_layers
-                break
-
-        self.block_count = self.vision_export_layers
-        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
-
-        patch_size = self.preprocessor_config.get("patch_size")
-        if patch_size is None:
-            raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json")
-
-        self.hparams_vision["patch_size"] = patch_size
-
-        pos_emb_name = next(
-            (
-                name for name in self.model_tensors
-                if name.endswith("vision_model.embeddings.position_embedding.weight")
-            ),
-            None,
-        )
-        if pos_emb_name is None:
-            raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight")
-
-        pos_emb_shape = self.model_tensors[pos_emb_name]().shape
-        base_grid_tokens = int(pos_emb_shape[0])
-        grid_side = math.isqrt(base_grid_tokens)
-        if grid_side * grid_side != base_grid_tokens:
-            raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}")
-
-        self.hparams_vision["image_size"] = grid_side * patch_size
-
-        min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches"))
-        max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches"))
-        if min_num_patches is None or max_num_patches is None:
-            raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches")
-
-        self.min_pixels = int(min_num_patches) * patch_size * patch_size
-        self.max_pixels = int(max_num_patches) * patch_size * patch_size
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        assert self.hparams_vision is not None
-
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4)
-        self.gguf_writer.add_vision_min_pixels(self.min_pixels)
-        self.gguf_writer.add_vision_max_pixels(self.max_pixels)
-        self.gguf_writer.add_vision_use_gelu(True)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.startswith(("model.vision_tower.vision_tower.", "vision_tower.")):
-            if ".vision_model.head." in name:
-                return
-
-            new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.")
-
-            if ".vision_model.post_layernorm." in new_name:
-                return
-
-            if bid is not None and bid == self.vision_last_layer_idx:
-                return
-
-            if new_name.endswith("vision_model.embeddings.patch_embedding.weight"):
-                assert self.hparams_vision is not None
-                if data_torch.ndim != 2:
-                    raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}")
-
-                patch_area = self.hparams_vision["patch_size"] ** 2
-                in_features = data_torch.shape[1]
-                if in_features % patch_area != 0:
-                    raise ValueError(
-                        f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}"
-                    )
-
-                num_channels = in_features // patch_area
-                patch_size = self.hparams_vision["patch_size"]
-                data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels)
-                data_torch = data_torch.permute(0, 3, 1, 2)
-
-            yield from super().modify_tensors(data_torch, new_name, bid)
-            return
-
-        if name.startswith(("model.mm_projector.", "mm_projector.")):
-            local_name = name
-            local_name = local_name.replace("model.mm_projector.", "")
-            local_name = local_name.replace("mm_projector.", "")
-
-            if not (local_name.startswith("0.") or local_name.startswith("2.")):
-                return
-
-            suffix = ".bias" if local_name.endswith(".bias") else ".weight"
-            mm_idx = int(local_name.split(".", maxsplit=1)[0])
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch)
-            return
-
-        return
-

@ModelBase.register("PhiMoEForCausalLM")
 class PhiMoeModel(Phi3MiniModel):
@ -5688,7 +5404,7 @@ class KimiLinearModel(TextModel):
        # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
        linear_attn_config = self.hparams["linear_attn_config"]
        # n_head == 0 for KDA layers, n_head > 0 for MLA layers
-        # full_attention_layers list will be used to distinguish layer type
+        # full_attention_layers list will be used to distinguish layer type
        _num_kv_heads = list()
        _full_attn_layers = linear_attn_config["full_attn_layers"]
        for il in range(self.hparams["num_hidden_layers"]):
@ -6789,7 +6505,7 @@ class Gemma3VisionModel(MmprojModel):
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
-        # default values below are taken from HF transformers code
+        # default values below are taken from HF transformers code
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)
        # calculate proj_scale_factor (used by tinygemma3 test model)
@ -7381,7 +7097,7 @@ class Rwkv7Model(TextModel):

            if bid == 0 and "time_mix_a" in new_name:
                # dummy v0/v1/v2 on first layer
-                # easiest way to make llama happy
+                # easiest way to make llama happy
                yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)

            yield (new_name, data_torch)
@ -9880,7 +9596,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
        # NOTE: Explicitly include hparam prefix prefix for d_model to
        #   disambiguate with top-level head_dim
        # NOTE 2: If needed for future models, this can be isolated in a method
-        #   to separate the prefix setting and the keys used
+        #   to separate the prefix setting and the keys used
        self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
        self.n_group = self.find_hparam(["n_groups", "num_groups"])
        self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
@ -10011,38 +9727,23 @@ class NemotronHModel(GraniteHybridModel):
        # M: Mamba2, *: Attention, -: MLP
        # MoE:
        # M: Mamba2, *: Attention, E: Expert
-        pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
-        if pattern is None:
-            self._ssm_layers = []
-            self._mlp_layers = []
-        elif isinstance(pattern, str):
-            self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"]
-            self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")]
-        else:
-            self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"]
-            self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"]
+        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
+        self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
+        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]

    def get_attn_layers(self):
-        pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
-        if pattern is None:
-            return []
-        assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!"
-        if isinstance(pattern, str):
-            return [i for i, val in enumerate(pattern) if val == "*"]
-
-        return [i for i, val in enumerate(pattern) if val == "attention"]
+        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
+        assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
+        return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

-        head_dim = self.head_dim
-        if head_dim is None:
-            raise ValueError("Could not find the attention head dim in config")
-        self.gguf_writer.add_key_length(head_dim)
-        self.gguf_writer.add_value_length(head_dim)
+        self.gguf_writer.add_key_length(self.head_dim)
+        self.gguf_writer.add_value_length(self.head_dim)

        # Set feed_forward_length
-        # NOTE: This will trigger an override warning. This is preferable to
+        # NOTE: This will trigger an override warning. This is preferable to
        #   duplicating all the parent logic
        if not self.is_moe:
            n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
@ -10067,9 +9768,6 @@ class NemotronHModel(GraniteHybridModel):
            if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
                self.gguf_writer.add_expert_used_count(n_experts_used)

-            if (latent_size := self.hparams.get("moe_latent_size")) is not None:
-                self.gguf_writer.add_moe_latent_size(latent_size)
-
    def set_vocab(self):
        super().set_vocab()

@ -10089,13 +9787,6 @@ class NemotronHModel(GraniteHybridModel):
            name = name[len("language_model."):]

        if self.is_moe and bid is not None:
-            # Skip Multi-Token Prediction (MTP) tensors. These are used for
-            # for speculative decoding but we don't include them in this model
-            # conversion. See https://github.com/ggml-org/llama.cpp/pull/18886
-            if name.startswith("mtp."):
-                logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}")
-                return
-
            if name.endswith("mixer.gate.e_score_correction_bias"):
                new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
                yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@ -1,525 +0,0 @@
-# Auto-Parser Architecture
-
-The auto-parser automatically analyzes chat templates to determine how to parse model outputs, including content, reasoning, and tool calls.
-
-## Overview
-
-The unified auto-parser uses a pure differential, compositional approach (inspired by the `git diff` algorithm) to analyze chat templates:
-
-**Core Philosophy**:
-
- **Minimize Hardcoded Patterns**: All markers extracted through template comparison (the only heuristic is JSON detection to distinguish `JSON_NATIVE` from tag-based formats)
- **Compositional Architecture**: Separate analyzer structs for reasoning, content, and tools — each responsible for its own analysis and parser construction
-
-**Analysis + Parser Building in Two Steps**:
-
-1. `autoparser::autoparser tmpl_analysis(tmpl)` — runs all differential comparisons and populates the analysis structs
-2. `autoparser::peg_generator::generate_parser(tmpl, params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar
-
-## Data Structures
-
-All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h).
-
-### Top-Level: `autoparser` (main analyzer and generator)
-
-[common/chat-auto-parser.h:367-388](common/chat-auto-parser.h#L367-L388) — top-level analysis result aggregating `jinja_caps`, `reasoning`, `content`, and `tools` sub-analyses, plus `preserved_tokens` (union of all non-empty markers).
-
-### `analyze_reasoning`
-
-[common/chat-auto-parser.h:254-274](common/chat-auto-parser.h#L254-L274) — reasoning analysis result: `mode` enum, `start` marker (e.g. `<think>`), and `end` marker (e.g. `</think>`).
-
-### `analyze_content`
-
-[common/chat-auto-parser.h:280-295](common/chat-auto-parser.h#L280-L295) — content analysis result: `mode` enum, `start`/`end` markers, and `requires_nonnull_content` flag.
-
-### `analyze_tools` and its sub-structs
-
- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`, `uses_python_dicts`)
- [common/chat-auto-parser.h:196-200](common/chat-auto-parser.h#L196-L200) — `tool_function_analysis`: `name_prefix`, `name_suffix`, `close` markers around function names
- [common/chat-auto-parser.h:202-210](common/chat-auto-parser.h#L202-L210) — `tool_arguments_analysis`: `start/end` container markers, `name_prefix/suffix`, `value_prefix/suffix`, `separator`
- [common/chat-auto-parser.h:212-217](common/chat-auto-parser.h#L212-L217) — `tool_id_analysis`: `pos` enum, `prefix`/`suffix` markers around call ID values
- [common/chat-auto-parser.h:301-361](common/chat-auto-parser.h#L301-L361) — `analyze_tools`: aggregates the four sub-structs above
-
-### Enums
-
-**`reasoning_mode`**: How the template handles reasoning/thinking blocks.
-
-| Value           | Description                                                                       |
-|-----------------|-----------------------------------------------------------------------------------|
-| `NONE`          | No reasoning markers detected                                                     |
-| `TAG_BASED`     | Standard tag-based: `<think>...</think>`                                          |
-| `DELIMITER`     | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`)   |
-| `FORCED_OPEN`   | Template ends with open reasoning tag when `enable_thinking=true`                 |
-| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start  |
-| `TOOLS_ONLY`    | Reasoning only appears in tool call responses, not plain content                  |
-
-**`content_mode`**: How the template wraps assistant content.
-
-| Value                    | Description                                                    |
-|--------------------------|----------------------------------------------------------------|
-| `PLAIN`                  | No content markers                                             |
-| `ALWAYS_WRAPPED`         | Content always wrapped: `<response>...</response>`             |
-| `WRAPPED_WITH_REASONING` | Content wrapped only when reasoning is present                 |
-
-**`tool_format`**: Classification of tool call structure.
-
-| Value            | Description                                                      |
-|------------------|------------------------------------------------------------------|
-| `NONE`           | No tool support detected                                         |
-| `JSON_NATIVE`    | Pure JSON: `{"name": "X", "arguments": {...}}`                   |
-| `TAG_WITH_JSON`  | Tag-based with JSON args: `<function=X>{...}</function>`         |
-| `TAG_WITH_TAGGED`| Tag-based with tagged args: `<param=key>value</param>`           |
-
-**`call_id_position`**: Where call IDs appear in tag-based formats.
-
-| Value                    | Description                                  |
-|--------------------------|----------------------------------------------|
-| `NONE`                   | No call ID support detected                  |
-| `PRE_FUNC_NAME`          | Before function name                         |
-| `BETWEEN_FUNC_AND_ARGS`  | Between function name and arguments          |
-| `POST_ARGS`              | After arguments                              |
-
-## Tool Calling Formats
-
-### JSON_NATIVE
-
-**Structure**: The entire tool call (function name, arguments, values) is in JSON format. Optional enclosing tags around the section.
-
-**Detection**: Function name appears inside a JSON structure (quotes preceded by `{` or `:`).
-
-**Examples**:
-
-Standard OpenAI-style:
-
-```json
-<tool_call>
-{"name": "get_weather", "arguments": {"location": "Paris", "unit": "celsius"}}
-</tool_call>
-```
-
-Mistral Nemo with array wrapper:
-
-```json
-[TOOL_CALLS]
-[{"name": "calculate", "arguments": {"expr": "2+2"}}]
-```
-
-Function name as JSON key (Apertus style):
-
-```json
-{"get_weather": {"location": "Paris"}}
-```
-
---
-
-### TAG_WITH_JSON
-
-**Structure**: Function name is outside JSON, in tag attributes or XML-style tags. Arguments are a JSON object.
-
-**Detection**: Function name not in JSON, but argument names appear in JSON context.
-
-**Examples**:
-
-Functionary v3.1:
-
-```xml
-<function=get_weather>{"location": "Paris", "unit": "celsius"}</function>
-```
-
-MiniMax:
-
-```xml
-<minimax:tool_call>
-<tool_name>calculate</tool_name>
-<arguments>{"expr": "2+2"}</arguments>
-</minimax:tool_call>
-```
-
---
-
-### TAG_WITH_TAGGED
-
-**Structure**: Both function name and argument names are in XML-style tags. String values are unquoted; non-string values are JSON-formatted.
-
-**Detection**: Neither function name nor argument names appear in a JSON context.
-
-**Examples**:
-
-Qwen/Hermes XML format:
-
-```xml
-<function=get_weather>
-<param=location>Paris</param>
-<param=unit>celsius</param>
-</function>
-```
-
-Mixed types:
-
-```xml
-<function=calculate>
-<param=expr>2+2</param>
-<param=precision>2</param>
-<param=options>{"round": true}</param>
-</function>
-```
-
-String values (`Paris`, `celsius`, `2+2`) are unquoted; `options` (object type) is JSON-formatted.
-
---
-
-## Analysis Flow
-
-```text
-autoparser::autoparser(tmpl)
-    |
-    |-- Phase 1: analyze_reasoning(tmpl, jinja_caps.supports_tool_calls)
-    |     |-- R1: compare_reasoning_presence()   — with/without reasoning_content field
-    |     |-- R2: compare_thinking_enabled()     — enable_thinking=false vs true
-    |     '-- R3: compare_reasoning_scope()      — reasoning+content vs reasoning+tools
-    |           (only if supports_tool_calls)
-    |
-    |-- Phase 2: analyze_content(tmpl, reasoning)
-    |     '-- C1: compares content-only vs tools output and content-only vs reasoning output
-    |
-    |-- Phase 3: analyze_tools(tmpl, jinja_caps, reasoning)
-    |     (skipped entirely if !jinja_caps.supports_tool_calls)
-    |     |
-    |     |-- T1: analyze_tool_calls()           — no tools vs with tools; classifies format
-    |     |         |-- JSON path → analyze_tool_call_format_json_native()
-    |     |         '-- tag path → analyze_tool_call_format_non_json()
-    |     |
-    |     (if format != NONE and format != JSON_NATIVE:)
-    |     |
-    |     |-- T2: check_per_call_markers()       — 1 call vs 2 calls; moves section→per-call if needed
-    |     |         (only if supports_parallel_tool_calls)
-    |     |
-    |     |-- T3: extract_function_markers()     — func_alpha vs func_beta; extracts name prefix/suffix/close
-    |     |
-    |     |-- T4: analyze_arguments()            — (TAG_WITH_TAGGED only)
-    |     |         |-- A1: extract_argument_name_markers()   — arg_name_A vs arg_name_B
-    |     |         '-- A2: extract_argument_value_markers()  — value "XXXX" vs "YYYY"
-    |     |
-    |     |-- T5: extract_argument_separator()   — 1 arg vs 2 args; finds separator between args
-    |     |
-    |     |-- T6: extract_args_markers()         — 0 args vs 1 arg; finds args container markers
-    |     |
-    |     '-- T7: extract_call_id_markers()      — call_id "call00001" vs "call99999"
-    |
-    '-- collect_preserved_tokens()               — union of all non-empty markers
-    |
-    '-- apply workarounds()                      — post-hoc patches for edge-case templates
-    |
-    v
-autoparser (analysis result)
-    |
-    v
-autoparser::peg_generator::generate_parser(tmpl, inputs, analysis)
-    |-- analysis.build_parser(inputs)            — builds PEG parser arena
-    |     |-- reasoning.build_parser(ctx)        — reasoning parser (mode-dependent)
-    |     |-- content.build_parser(ctx)          — content parser (mode-dependent)
-    |     '-- tools.build_parser(ctx)            — tool parser (dispatches by tool_format)
-    |           |-- build_tool_parser_json_native()
-    |           |-- build_tool_parser_tag_json()
-    |           '-- build_tool_parser_tag_tagged()
-    |
-    |-- Build GBNF grammar (if tools present and trigger_marker non-empty)
-    '-- Set grammar_triggers from section_start or per_call_start
-    |
-    v
-common_chat_params (prompt, parser, grammar, triggers, preserved_tokens)
-```
-
-## Entry Point
-
-The auto-parser is invoked in [common/chat.cpp:1280-1310](common/chat.cpp#L1280-L1310) in `common_chat_templates_apply_jinja`. A few specialized templates are handled first (Ministral/Magistral Large 3, GPT-OSS with `<|channel|>`, Functionary v3.2 with `>>>all`), then the auto-parser handles everything else via `autoparser::autoparser` + `peg_generator::generate_parser`.
-
-## Algorithm Details
-
-### Core Mechanism: Differential Comparison
-
-All analysis phases use the same factorized comparison function declared in [common/chat-auto-parser-helpers.h:68](common/chat-auto-parser-helpers.h#L68):
-
-```cpp
-compare_variants(tmpl, params_A, params_modifier)
-```
-
-This creates variant B by applying a modifier lambda to a copy of `params_A`, renders both through the template, and computes a `diff_split` ([common/chat-auto-parser.h:28-37](common/chat-auto-parser.h#L28-L37)):
-
- `prefix` — common prefix between A and B
- `suffix` — common suffix between A and B
- `left` — unique to variant A
- `right` — unique to variant B
-
-The diff is computed via `calculate_diff_split()`, which finds the longest-common-prefix and longest-common-suffix, then iteratively moves incomplete `<...>` or `[...]` markers from the prefix/suffix into left/right until stable (tag boundary fixing).
-
-Text is segmentized into markers and non-marker fragments using `segmentize_markers()`, which splits on `<...>` and `[...]` boundaries.
-
-### Phase 1: Reasoning Analysis
-
-**R1 — `compare_reasoning_presence()`**: Compares assistant message with vs without a `reasoning_content` field.
-
- Searches `diff.right` (output with reasoning) for the reasoning content needle
- Uses PEG parsers to find surrounding markers:
-  - If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
-  - If both found but post marker only in the full output B → `FORCED_CLOSED`
-  - If only post marker found → `DELIMITER`
- Sets `reasoning.start` and `reasoning.end`
-
-**R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.
-
- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
- Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
-
-**R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.
-
- Only runs if `jinja_caps.supports_tool_calls`
- Detects `TOOLS_ONLY`: reasoning content present in B (with tools) but not in A (with text content)
- Extracts reasoning markers from the tool call output using PEG parsers
-
-### Phase 2: Content Analysis
-
-**C1**: Two comparisons in the `analyze_content` constructor:
-
- Comparison 1: content-only output vs tool-call output → `diff_tools`
- Comparison 2: content-only output vs reasoning+empty-content output → `diff_reasoning`
-
-Classification logic:
-
- `PLAIN`: `diff_tools.left` equals the response string (content is the entire diff, no wrapper)
- `ALWAYS_WRAPPED`: markers found surrounding the content text in `pure_content` → extracts `start`/`end`
-
-### Phase 3: Tool Call Analysis
-
-**T1 — `analyze_tool_calls()`**: Compares no-tools vs with-tools output.
-
- Extracts the tool call section as `diff.right`
- Calls `analyze_tool_call_format()` which first strips reasoning markers from the haystack, then:
-  - Calls `in_json_haystack()` for both function name and argument name needles
-  - `in_json_haystack()` uses a PEG parser to check whether the needle appears in a JSON context (preceded by `{` or `:` with surrounding quotes)
-  - If function name is in JSON → `JSON_NATIVE` → `analyze_tool_call_format_json_native()`
-  - If function name not in JSON, arg name is in JSON → `TAG_WITH_JSON`
-  - If neither in JSON → `TAG_WITH_TAGGED`
-  - `analyze_tool_call_format_json_native()`: parses the JSON object, matches field values to needles to populate `name_field`, `args_field`, `id_field`, `gen_id_field`; detects `tools_array_wrapped`; extracts `section_start`/`section_end`
-  - `analyze_tool_call_format_non_json()`: uses PEG parsers on the haystack to find up to two opening markers (section + per-call) then up to two closing markers
-
-**T2 — `check_per_call_markers()`**: Compares 1 call vs 2 calls.
-
- Computes a secondary diff of the second call portion vs the common suffix
- If the second call content starts with `section_start` → the section marker is actually per-call → moves `section_start/end` to `per_call_start/end` and clears the section markers
-
-**T3 — `extract_function_markers()`**: Compares function name `FUN_FIRST` vs `FUN_SECOND` (two different named functions).
-
- Finds where the function name appears in `diff.left`
- Extracts `function.name_prefix` from the common prefix up to the function marker, and `function.name_suffix` from after the name up to the next marker
- Extends `name_suffix` into `diff.suffix` (to the first marker for TAG_WITH_TAGGED; to the first `{` or `[` for TAG_WITH_JSON)
- Extracts `function.close` from after the last argument value up to the per-call/section end marker
-
-**T4 — `analyze_arguments()`** (TAG_WITH_TAGGED only):
-
- **A1 `extract_argument_name_markers()`**: Compares `arg_name_A` vs `arg_name_B` (two different argument names).
-  - Finds shared surrounding structure → `arguments.name_prefix`, `arguments.name_suffix`
- **A2 `extract_argument_value_markers()`**: Compares argument value `"XXXX"` vs `"YYYY"` (same arg, different value).
-  - Finds markers surrounding the value → `arguments.value_prefix`, `arguments.value_suffix`
-
-**T5 — `extract_argument_separator()`**: Compares 1 argument vs 2 arguments (same function).
-
- Uses `until_common_prefix(diff.right, ARG_FIRST, ARG_SECOND)` to find what separates the two argument blocks
-
-**T6 — `extract_args_markers()`**: Compares 0 arguments vs 1 argument.
-
- Uses `until_common_prefix()` and `after_common_suffix()` with the empty and single-arg JSON strings as anchors to find container markers (`arguments.start`, `arguments.end`)
-
-**T7 — `extract_call_id_markers()`**: Compares call IDs `"call00001"` vs `"call99999"`.
-
- Determines whether function name appears in `diff.prefix` or `diff.suffix` to classify position:
-  - Function name in prefix only → `BETWEEN_FUNC_AND_ARGS` or `POST_ARGS` (further distinguished by where `{` appears)
-  - Function name in suffix only → `PRE_FUNC_NAME`
- Extracts `call_id.prefix` and `call_id.suffix` markers around the call ID value
- Clears `per_call_end` if it incorrectly incorporated the call ID suffix
-
-### Workarounds
-
-A workaround array in `common/chat-diff-analyzer.cpp` applies post-hoc patches after analysis. Each workaround is a lambda that inspects the template source and overrides analysis results. Current workarounds:
-
-1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')`: sets `reasoning.mode = FORCED_OPEN` with `<think>`/`</think>` markers if no reasoning was detected
-2. **Granite 3.3** — source contains specific "Write your thoughts" text: forces `TAG_BASED` reasoning with `<think>`/`</think>` and `WRAPPED_WITH_REASONING` content with `<response>`/`</response>`
-3. **Cohere Command R+** — source contains `<|CHATBOT_TOKEN|>`: sets `ALWAYS_WRAPPED` content mode if no content start is already set
-4. **Functionary 3.1** — source contains `set has_code_interpreter`: forces `PLAIN` content, specific `per_call_start/end`, clears preserved tokens to only keep Functionary-specific markers
-5. **DeepSeek-R1-Distill-Qwen** — source contains `tool▁calls▁begin` markers: overrides tool section/per-call markers with the correct Unicode block characters
-
-### Parser Building
-
-Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) implements `build_parser(parser_build_context&)`. They share a `parser_build_context` that carries the PEG builder, inference inputs, the pre-built reasoning parser, and a pointer to the content analyzer.
-
-#### Reasoning Parser (`analyze_reasoning::build_parser`)
-
-| Mode                              | Parser                                                              |
-|-----------------------------------|---------------------------------------------------------------------|
-| Not extracting reasoning          | `eps()`                                                             |
-| `FORCED_OPEN` or `FORCED_CLOSED`  | `reasoning(until(end)) + end` — opening tag was in the prompt       |
-| `TAG_BASED` or `TOOLS_ONLY`       | `optional(start + reasoning(until(end)) + end)`                     |
-| `DELIMITER`                       | `optional(reasoning(until(end)) + end)` — no start marker           |
-
-#### Content Parser (`analyze_content::build_parser`)
-
-| Condition                              | Parser                                                                          |
-|----------------------------------------|---------------------------------------------------------------------------------|
-| `json_schema` present                  | `reasoning + space() + content(schema(json(), "response-format", ...)) + end()` |
-| Tools present                          | Dispatches to `analyze_tools::build_parser()`                                   |
-| `ALWAYS_WRAPPED` with reasoning        | `reasoning + start + content(until(end)) + end + end()`                         |
-| `ALWAYS_WRAPPED` without reasoning     | `content(until(start)) + start + content(until(end)) + end + end()`             |
-| Default (PLAIN)                        | `reasoning + content(rest()) + end()`                                           |
-
-#### Tool Parsers (`analyze_tools::build_parser`)
-
-Dispatches by `format.mode`:
-
-**`build_tool_parser_json_native()`**: Calls `p.standard_json_tools()` which internally dispatches to:
-
- `build_json_tools_function_is_key()` — function name is the JSON key: `{"get_weather": {...}}`
- `build_json_tools_nested_keys()` — nested: `{"function": {"name": "X", "arguments": {...}}}`
- `build_json_tools_flat_keys()` — flat: `{"name": "X", "arguments": {...}}`
-
-Handles content wrappers, array wrapping (`tools_array_wrapped`), parallel calls, and `parameter_order`.
-
-**`build_tool_parser_tag_json()`**: For each tool function:
-
-```text
-tool_open(name_prefix + tool_name(literal(name)) + name_suffix) +
-    call_id_section +
-    tool_args(schema(json(), tool_schema))
-  [+ function.close if non-empty]
-```
-
-Wrapped in per-call markers (with optional parallel call repetition) then optionally in section markers.
-
-**`build_tool_parser_tag_tagged()`**: For each tool function, builds one parser per argument:
-
- String types: `tool_arg_string_value(schema(until(value_suffix), ...))`
- JSON types: `tool_arg_json_value(schema(json(), ...))`
- Required args are plain; optional args wrapped in `optional()`
- Arguments joined with `space()` between consecutive parsers
-
-For closing: uses `function.close` if present; otherwise uses `peek(per_call_end)` to avoid premature close during partial streaming; falls back to `tool_close(space())` to trigger mapper callbacks.
-
-All three tool parsers return:
-
-```text
-reasoning + optional(content(until(trigger_marker))) + tool_calls + end()
-```
-
-### Python Dict Format
-
-When `format.uses_python_dicts` is true (detected when single-quoted strings appear in JSON argument context), `build_parser()` pre-registers a `json-string` rule that accepts both single-quoted and double-quoted strings. This is done before any `p.json()` call so all JSON parsing inherits the flexible rule.
-
-## Mapper
-
-`common_chat_peg_mapper` maps PEG parse results (AST nodes) into `common_chat_msg` structures. Key design:
-
- **Buffered arguments**: Before `tool_name` is known, argument text goes to `args_buffer`; once the name is set, the buffer is flushed to `current_tool->arguments`
- **`args_target()`**: Returns a reference to whichever destination is currently active (buffer or tool args), eliminating branching
- **`closing_quote_pending`**: Tracks whether a closing `"` needs to be appended when a string argument value is finalized (for schema-declared string types in tagged format)
- **Quote normalization**: Python-style quotes (`'key': 'value'`) are converted to JSON (`"key": "value"`)
- **Brace auto-closing**: At tool close, unclosed `{` braces are closed automatically
-
-## Files
-
-| File                                      | Purpose                                                              |
-|-------------------------------------------|----------------------------------------------------------------------|
-| `common/chat-auto-parser.h`               | All analysis structs, enums, `autoparser`, `peg_generator`, `templates_params` |
-| `common/chat-auto-parser-generator.cpp`   | Parser generator: `generate_parser()` and `build_parser()` methods   |
-| `common/chat-diff-analyzer.cpp`           | Differential analysis implementation and workarounds                 |
-| `common/chat-auto-parser-helpers.h/cpp`   | `calculate_diff_split()`, `segmentize_markers()`,                    |
-|                                           | `compare_variants()`, string helpers                                 |
-| `common/chat-peg-parser.h/cpp`            | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers     |
-| `common/chat.cpp`                         | Entry point: `common_chat_templates_apply_jinja()`                   |
-| `tools/parser/debug-template-parser.cpp`  | Debug tool for template analysis                                     |
-| `tools/parser/template-analysis.cpp`      | Template analysis tool                                               |
-
-## Testing & Debugging
-
-### Debug Tools
-
-**Template Debugger**: `tools/parser/debug-template-parser.cpp`
-
- Usage: `./bin/llama-debug-template-parser path/to/template.jinja`
- Shows detected format, markers, generated parser, and GBNF grammar
-
-**Template Analysis**: `tools/parser/template-analysis.cpp`
-
- Usage: `./bin/llama-template-analysis path/to/template.jinja`
-
-**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
-
- Shows detailed analysis steps, pattern extraction results, and generated parser structure
-
-**PEG Test Builder**: Fluent API for creating test cases — see [tests/test-chat.cpp:947-1043](tests/test-chat.cpp#L947-L1043). Example usage:
-
-```cpp
-auto tst = peg_tester("models/templates/Template.jinja");
-tst.test("input text")
-   .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
-   .tools({tool_json})
-   .parallel_tool_calls(true)
-   .enable_thinking(true)
-   .expect(expected_message)
-   .run();
-```
-
-### Tested Templates
-
-The following templates have active tests in `tests/test-chat.cpp`:
-
-| Template | Format | Notes |
-| -------- | ------ | ----- |
-| Ministral-3-14B-Reasoning | Reasoning | `[THINK]...[/THINK]` tags (specialized handler) |
-| NVIDIA-Nemotron-3-Nano-30B | TAG_WITH_TAGGED | Reasoning + tools |
-| CohereForAI Command-R7B | JSON_NATIVE | `<\|START_THINKING\|>`/`<\|START_RESPONSE\|>` markers |
-| Google Gemma 2 2B | Content only | No tool support |
-| Qwen-QwQ-32B | Reasoning | Forced-open thinking |
-| NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper |
-| IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` |
-| ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags |
-| Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format |
-| DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode |
-| GLM-4.6 | TAG_WITH_TAGGED | `<tool_call>name\n<arg_key>...<arg_value>...` format |
-| GLM-4.7-Flash | TAG_WITH_TAGGED | Updated GLM format |
-| Kimi-K2-Thinking | JSON_NATIVE | Reasoning + JSON tools |
-| Apertus-8B-Instruct | JSON_NATIVE | Function name as JSON key |
-| MiniMax-M2 | TAG_WITH_JSON | XML invoke with JSON args |
-| NVIDIA-Nemotron-Nano-v2 | JSON_NATIVE | `<TOOLCALL>` wrapper (nested) |
-| CohereForAI Command-R Plus | JSON_NATIVE | Markdown code block format |
-| Mistral-Nemo-Instruct-2407 | JSON_NATIVE | `[TOOL_CALLS]` wrapper with ID field |
-| Functionary v3.1 | TAG_WITH_JSON | `<function=X>` format |
-| Functionary v3.2 | Specialized | `>>>` recipient delimiter (dedicated handler) |
-| Fireworks Firefunction v2 | TAG_WITH_JSON | Fireworks tool format |
-| DeepSeek R1 Distill (Llama/Qwen) | Reasoning | Forced-open thinking |
-| llama-cpp-deepseek-r1 | Reasoning | Forced-open thinking |
-| Kimi-K2 / Kimi-K2-Instruct | JSON_NATIVE | JSON tools with special markers |
-| Llama 3.1/3.2/3.3 | JSON_NATIVE | Standard Llama tool format |
-| OpenAI GPT-OSS | Specialized | Channel-based (dedicated handler) |
-| Apriel 1.5 | JSON_NATIVE | `<tool_calls>` wrapper with JSON array |
-| Apriel 1.6 Thinker | Reasoning | Implicit reasoning start |
-| Mistral Small 3.2 | JSON_NATIVE | `[TOOL_CALLS]func[ARGS]{...}` with call ID |
-| Devstral | JSON_NATIVE | `[TOOL_CALLS]func[ARGS]{...}` without call ID |
-| StepFun 3.5 Flash | TAG_WITH_TAGGED | `<function=X><parameter=Y>` format |
-
-## Adding Support for New Templates
-
-To support a new template format:
-
-1. **If it follows standard patterns** — The auto-parser should detect it automatically. Run `llama-debug-template-parser` to verify markers are correctly extracted.
-2. **If differential analysis extracts incorrect markers** — Add a workaround lambda to the `workarounds` vector in `common/chat-diff-analyzer.cpp`. Inspect the template source for a unique identifying substring.
-3. **If it needs fundamentally different handling** — Add a dedicated handler function in `chat.cpp` before the auto-parser block (as done for GPT-OSS, Functionary v3.2, and Ministral).
-
-## Edge Cases and Quirks
-
-1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
-2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
-3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
-4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
-5. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
-6. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
-7. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@ -20,7 +20,7 @@

 **Llama.cpp + CANN**

-The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are integrated to CANN Toolkit and kernels to using Ascend NPU directly.
+The llama.cpp CANN backend is designed to support Ascend NPU. It utilizes the ability of AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to use the Ascend NPU directly.

 ## News

@ -210,7 +210,7 @@ docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager
    # and install driver.
    sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
    ```
-    If the following message appears, firmware is installed successfully.
+    If the following message appears, firmware is installed successfully.
    ```sh
    Firmware package installed successfully!
    ```
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@ -9,7 +9,6 @@
 - [Linux](#linux)
 - [Windows](#windows)
 - [Environment Variable](#environment-variable)
- [Design Rule](#design-rule)
 - [Known Issue](#known-issues)
 - [Q&A](#qa)
 - [TODO](#todo)
@ -42,9 +41,6 @@ The following releases are verified and recommended:

 ## News

- 2026.03
-  - Support Flash-Attention: less memory usage, performance impact depends on LLM.
-
 - 2026.02
  - Remove support for Nvidia & AMD GPU, because the oneAPI plugin for Nvidia & AMD GPU is unavailable: download/installation channels are out of work. User can't build up the software for Nvidia & AMD GPU.

@ -382,27 +378,17 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 ## Windows

-### Install GPU driver
+### I. Setup Environment
+
+1. Install GPU driver

 Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

-### Option 1: download the binary package directly
-
-Download the binary package for Windows from: https://github.com/ggml-org/llama.cpp/releases.
-
-Extract the package to local folder, run the llama tools directly. Refer to [Run the inference](#iii-run-the-inference-1).
-
-Note, the package includes the SYCL running time and all depended dll files, no need to install oneAPI package and activte them.
-
-### Option 2: build locally from the source code.
-
-#### I. Setup environment
-
-1. Install Visual Studio
+2. Install Visual Studio

 If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).

-2. Install Intel® oneAPI Base toolkit
+3. Install Intel® oneAPI Base toolkit

 SYCL backend depends on:
  - Intel® oneAPI DPC++/C++ compiler/running-time.
@ -453,25 +439,25 @@ Output (example):
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
 ```

-3. Install build tools
+4. Install build tools

 a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
 b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)


-#### II. Build llama.cpp
+### II. Build llama.cpp

 You could download the release package for Windows directly, which including binary files and depended oneAPI dll files.

 Choose one of following methods to build from source code.

-##### Option 1: Script
+#### 1. Script

 ```sh
 .\examples\sycl\win-build-sycl.bat
 ```

-##### Option 2: CMake
+#### 2. CMake

 On the oneAPI command line window, step into the llama.cpp main directory and run the following:

@ -500,7 +486,7 @@ cmake --preset x64-windows-sycl-debug
 cmake --build build-x64-windows-sycl-debug -j --target llama-completion
 ```

-##### Option 3: Visual Studio
+#### 3. Visual Studio

 You have two options to use Visual Studio to build llama.cpp:
 - As CMake Project using CMake presets.
@ -510,7 +496,7 @@ You have two options to use Visual Studio to build llama.cpp:

 All following commands are executed in PowerShell.

-###### - Open as a CMake Project
+##### - Open as a CMake Project

 You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:

@ -525,7 +511,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
    cmake --build build --config Release -j --target llama-completion
    ```

-###### - Generating a Visual Studio Solution
+##### - Generating a Visual Studio Solution

 You can use Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.

@ -613,7 +599,7 @@ found 2 SYCL devices:

 ```

-##### Choose level-zero devices
+#### Choose level-zero devices

 |Chosen Device ID|Setting|
 |-|-|
@ -621,7 +607,7 @@ found 2 SYCL devices:
 |1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
 |0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|

-##### Execute
+#### Execute

 Choose one of following methods to run.

@ -679,7 +665,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 ## Environment Variable

-### Build
+#### Build

 | Name               | Value                                 | Function                                    |
 |--------------------|---------------------------------------|---------------------------------------------|
@ -694,50 +680,23 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 1. FP32 or FP16 have different performance impact to LLM. Recommended to test them for better prompt processing performance on your models. You need to rebuild the code after change `GGML_SYCL_F16=OFF/ON`.

-### Runtime
+#### Runtime

 | Name              | Value            | Function                                                                                                                  |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
-| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|

-## Design Rule

- Open to all contributors.
-
- All code change should be useful to user:
-    - Fix bug.
-    - Add new function.
-    - Improve the performance/usage.
-    - Make code be easy to maintain.
-    - ...
-
- Don't accept the codes of following cases:
-    - Break legacy function.
-    - Reduce the performance of legacy case in default.
-    - Not completed work/the functionality cannot be demonstrated.
-
- Encourage to use environment variable to control features to be opened/closed.
-    - User can evaluate the feature without rebuild the code.
-    - Recommend the best features to user by setting them be opened as default.
-
- Design the code based on the published official releases of oneAPI packages: compiler, library, driver, OS kernel.
-
- Developers need to maintain the code they submit.

 ## Known Issues

 - `Split-mode:[row]` is not supported.

- Missed the AOT (Ahead-of-Time) in buiding.
-  - Good: build quickly, smaller size of binary file.
-  - Bad: The startup is slow (JIT) in first time, but subsequent performance is unaffected.
-
 ## Q&A

 - Error:  `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.
@ -749,7 +708,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

  - Remove **build** folder or try a clean-build.

- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.
+- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.

  Please double-check with `sudo sycl-ls`.

@ -787,7 +746,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  ```

 ### **GitHub contribution**:
-Please add the `[SYCL]` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
+Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.

 ## TODO

--- a/docs/backend/VirtGPU/development.md
+++ b/docs/backend/VirtGPU/development.md
@ -55,8 +55,7 @@ LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend
 cmake -S . -B $LLAMA_MAC_BUILD \
      -DGGML_NATIVE=OFF \
      -DLLAMA_CURL=ON \
-      -DGGML_VIRTGPU=ON \
-      -DGGML_VIRTGPU_BACKEND=ONLY \
+      -DGGML_REMOTINGBACKEND=ONLY \
      -DGGML_METAL=ON

 TARGETS="ggml-metal"
@ -72,7 +71,6 @@ cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
 ```bash
 # Build virglrenderer with APIR support
 mkdir virglrenderer
-cd virglrenderer
 git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
 cd src

@ -97,7 +95,7 @@ mkdir llama.cpp
 git clone https://github.com/ggml-org/llama.cpp.git src
 cd src

-LLAMA_LINUX_BUILD=$PWD/build-virtgpu
+LLAMA_LINUX_BUILD=$PWD/build-virtgpu

 cmake -S . -B $LLAMA_LINUX_BUILD \
      -DGGML_VIRTGPU=ON
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@ -116,7 +116,7 @@ Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920
 ### Windows

 All artifacts are already installed in the `pkg-snapdragon` folder.
-To run, adapt below instructions to use Powershell scripts in `scripts/snapdragon/windows`.
+To run, adapt below instructions to use Powershell scripts in `scripts/snapdragon/windows`.

 ## How to Run

--- a/docs/backend/snapdragon/windows.md
+++ b/docs/backend/snapdragon/windows.md
@ -144,7 +144,7 @@ Once the build is complete HTP ops libraries will be installed like this
 -a----         1/22/2026   6:01 PM           4139 libggml-htp.cat
 ```

-The .cat file, the signature and proper certificate installation can be verified with
+The .cat file, the signature and proper certificate installation can be verified with

 ```
 > signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
--- a/docs/build.md
+++ b/docs/build.md
@ -108,7 +108,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
 - Using oneAPI docker image:
  If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.

-Check [Optimizing and Running LLaMA2 on Intel® CPU](https://builders.intel.com/solutionslibrary/optimizing-and-running-llama2-on-intel-cpu) for more information.
+Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

 ### Other BLAS libraries

@ -595,17 +595,11 @@ You can verify that KleidiAI is being used by running
 ```bash
 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
 ```
-If KleidiAI is enabled, the output will contain a line similar to:
+If KleidiAI is enabled, the output will contain a line similar to:
 ```
 load_tensors: CPU_KLEIDIAI model buffer size =  3474.00 MiB
 ```
-KleidiAI’s microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm, SVE, and SME. Llama.cpp selects the most efficient kernels at runtime based on detected CPU capabilities.
-On CPUs that support SME, SME microkernels are enabled automatically using runtime detection.
-The environment variable GGML_KLEIDIAI_SME can be used to control SME behavior:
- Not set: enable SME automatically if supported and detected.
- 0: disable SME.
- <n> > 0: enable SME and assume <n> available SME units (override auto detection).
-If SME is not supported by the CPU, SME microkernels are always disabled.
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.

 Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.

@ -705,7 +699,7 @@ To read documentation for how to build on Android, [click here](./android.md)

 ## WebGPU [In Progress]

-The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
+The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.

 In the llama.cpp directory, build with CMake:

--- a/docs/development/parsing.md
+++ b/docs/development/parsing.md
@ -22,7 +22,7 @@ Below is a contrived example demonstrating how to use the PEG parser to parse
 output from a model that emits arguments as JSON.

 ```cpp
-auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
    // Build a choice of all available tools
    auto tool_choice = p.choice();
    for (const auto & tool : tools) {
@ -212,7 +212,7 @@ mapper.from_ast(ctx.ast, result);

 ### Native

-The `common_chat_peg_builder` builds a `native` parser suitable for
+The `common_chat_peg_native_builder` builds a `native` parser suitable for
 models that emit tool arguments as a direct JSON object.

 - **`reasoning(p)`** - Tag node for `reasoning_content`
@ -225,7 +225,7 @@ models that emit tool arguments as a direct JSON object.
 - **`tool_args(p)`** - Tag the tool arguments

 ```cpp
-build_chat_peg_parser([&](common_chat_peg_builder & p) {
+build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
    auto get_weather_tool = p.tool(p.sequence({
        p.tool_open(p.literal("{")),
        p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
@ -246,7 +246,7 @@ build_chat_peg_parser([&](common_chat_peg_builder & p) {

 ### Constructed

-The `common_chat_peg_builder` builds a `constructed` parser
+The `common_chat_peg_constructed_builder` builds a `constructed` parser
 suitable for models that emit tool arguments as separate entities, such as XML
 tags.

@ -264,7 +264,7 @@ tags.
 - **`tool_arg_json_value(p)`** - Tag JSON value for the argument

 ```cpp
-build_chat_peg_parser([&](common_chat_peg_builder & p) {
+build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
    auto location_arg = p.tool_arg(
        p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
        p.tool_arg_string_value(p.until("</parameter>")),
--- a/docs/multimodal/MobileVLM.md
+++ b/docs/multimodal/MobileVLM.md
@ -281,7 +281,7 @@ llama_print_timings:       total time =    5990.25 ms /   202 tokens

 Just the same as above.

-**output**
+**output**
 ```sh
 encode_image_with_clip: image embedding created: 144 tokens

@ -305,7 +305,7 @@ llama_print_timings:       total time =   15513.95 ms /   412 tokens
 ## Run on Intel(R) Core(TM) Ultra7 115H
 ### operation system
 Windows11
-### compile
+### compile
 ```sh
 make -j32
 ```
--- a/docs/ops.md
+++ b/docs/ops.md
@ -23,31 +23,30 @@ Legend:
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
 |                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
+|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -55,7 +54,7 @@ Legend:
 |                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
+|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -64,7 +63,7 @@ Legend:
 |                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
@ -76,34 +75,34 @@ Legend:
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
 |                              PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                          POOL_1D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          POOL_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
 |                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                              SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                              SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
 |                         SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -117,5 +116,5 @@ Legend:
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/docs/ops/Vulkan.csv
+++ b/docs/ops/Vulkan.csv
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
@ -5023,20 +5023,20 @@
 "WebGPU: WebGPU","ARGMAX","type=f32,ne=[1024,12,1,1]","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGMAX","type=f32,ne=[2000,10,1,1]","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGMAX","type=f32,ne=[5438,3,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
 "WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","0","no","WebGPU"
 "WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","0","no","WebGPU"
 "WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","0","no","WebGPU"
@ -9535,38 +9535,38 @@
 "WebGPU: WebGPU","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","WebGPU"
 "WebGPU: WebGPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","WebGPU"
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -5,7 +5,6 @@
 #include "sampling.h"

 #include <algorithm>
-#include <clocale>
 #include <cstdio>
 #include <string>
 #include <vector>
@ -17,8 +16,6 @@ static void print_usage(int, char ** argv) {
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    params.prompt = "Hello my name is";
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -5,16 +5,14 @@
 #include "common.h"
 #include "log.h"

-#include <algorithm>
-#include <cassert>
-#include <cinttypes>
-#include <climits>
-#include <clocale>
-#include <cstdarg>
-#include <cstring>
-#include <ctime>
 #include <unordered_map>
 #include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <cinttypes>
+#include <ctime>
 #include <random>
 #include <stdexcept>
 #include <sstream>
@ -876,8 +874,6 @@ static std::string basename(const std::string &path) {
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_init();

    struct train_params params = get_default_train_params();
--- a/examples/debug/README.md
+++ b/examples/debug/README.md
@ -2,7 +2,7 @@

 This is a utility intended to help debug a model by registering a callback that
 logs GGML operations and tensor data. It can also store the generated logits or
-embeddings as well as the prompt and token ids for comparison with the original
+embeddings as well as the prompt and token ids for comparison with the original
 model.

 ### Usage
--- a/examples/deprecation-warning/deprecation-warning.cpp
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@ -1,14 +1,11 @@
 // Warns users that this filename was deprecated, and provides a link for more information.

-#include <clocale>
 #include <cstdio>
 #include <string>
 #include <unordered_map>

 // Main
 int main(int argc, char** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    std::string filename = "main";
    if (argc >= 1) {
        filename = argv[0];
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@ -43,12 +43,12 @@ Choose one of the following scheduling methods:
 - `-b`: Batch size

 ### Examples
-#### Dream architecture:
+#### Dream architecture:
 ```
 llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
 ```

-#### LLaDA architecture:
+#### LLaDA architecture:
 ```
 llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
 ```
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@ -7,7 +7,6 @@
 #include <limits.h>

 #include <algorithm>
-#include <clocale>
 #include <cmath>
 #include <cstring>
 #include <limits>
@ -539,8 +538,6 @@ static std::string format_input_text(const std::string & prompt, const std::stri
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    ggml_time_init();

    common_params params;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -3,7 +3,6 @@
 #include "log.h"
 #include "llama.h"

-#include <clocale>
 #include <ctime>
 #include <algorithm>

@ -95,8 +94,6 @@ static void print_raw_embeddings(const float * emb,
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -4,8 +4,6 @@
 #include "log.h"
 #include "llama.h"
 #include "llama-cpp.h"
-
-#include <clocale>
 #include <string>
 #include <vector>

@ -31,8 +29,6 @@ static bool run(llama_context * ctx, const common_params & params) {
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    base_callback_data cb_data;

    common_params params;
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@ -1,7 +1,6 @@
 #include "arg.h"
 #include "common.h"

-#include <clocale>
 #include <fstream>
 #include <sstream>
 #include <string>
@ -101,8 +100,6 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
 }

 int main(int, char **) {
-    std::setlocale(LC_NUMERIC, "C");
-
    for (const auto & md : md_files) {
        std::ifstream infile(md.fname);
        if (!infile.is_open()) {
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@ -1,14 +1,13 @@
 #include "ggml.h"
 #include "gguf.h"

-#include <algorithm>
-#include <clocale>
+#include <cstdlib>   /* abort() */
 #include <cstddef>
 #include <cstdio>
-#include <cstdlib>   /* abort() */
-#include <cstring>
-#include <stdexcept>
 #include <string>
+#include <stdexcept>
+#include <algorithm>
+#include <cstring>

 #include <sstream>
 #include <fstream>
@ -627,8 +626,6 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
 }

 int main(int argc, const char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    hash_params params;
    manifest_check_params manifest_check;
    hash_params_parse(argc, argv, params);
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "gguf.h"

-#include <clocale>
 #include <cstdio>
 #include <string>
 #include <sstream>
@ -241,8 +240,6 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    if (argc < 3) {
        printf("usage: %s data.gguf r|w [n]\n", argv[0]);
        printf("r: read data.gguf file\n");
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@ -633,7 +633,7 @@ class SchemaConverter:
            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))

        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
-            items = schema.get('items', schema.get('prefixItems'))
+            items = schema.get('items') or schema['prefixItems']
            if isinstance(items, list):
                return self._add_rule(
                    rule_name,
@ -689,11 +689,6 @@ class SchemaConverter:
        elif (schema_type == 'object') or (len(schema) == 0):
            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))

-        elif schema_type is None and isinstance(schema, dict):
-            # No type constraint and no recognized structural keywords (e.g. {"description": "..."}).
-            # Per JSON Schema semantics this is equivalent to {} and accepts any value.
-            return self._add_rule(rule_name, self._add_primitive('value', PRIMITIVE_RULES['value']))
-
        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
--- a/examples/llama.vim
+++ b/examples/llama.vim
@ -52,8 +52,8 @@ highlight llama_hl_info guifg=#77ff2f ctermfg=119
 "   n_prefix:         number of lines before the cursor location to include in the local prefix
 "   n_suffix:         number of lines after  the cursor location to include in the local suffix
 "   n_predict:        max number of tokens to predict
-"   t_max_prompt_ms:  max allotted time for the prompt processing (TODO: not yet supported)
-"   t_max_predict_ms: max allotted time for the prediction
+"   t_max_prompt_ms:  max allotted time for the prompt processing (TODO: not yet supported)
+"   t_max_predict_ms: max allotted time for the prediction
 "   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
 "   auto_fim:         trigger FIM completion automatically on cursor movement
 "   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -4,11 +4,10 @@
 #include "log.h"
 #include "llama.h"

-#include <algorithm>
-#include <clocale>
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <algorithm>

 struct ngram_data {
    bool active = false;
@ -39,8 +38,6 @@ struct ngram_container {
 };

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@ -3,13 +3,10 @@
 #include "ngram-cache.h"
 #include "llama.h"

-#include <clocale>
 #include <string>
 #include <vector>

 int main(int argc, char ** argv){
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@ -3,7 +3,6 @@
 #include "common.h"
 #include "ngram-cache.h"

-#include <clocale>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
@ -18,8 +17,6 @@ static void print_usage(char* argv0) {
 }

 int main(int argc, char ** argv){
-    std::setlocale(LC_NUMERIC, "C");
-
    if (argc < 3) {
        print_usage(argv[0]);
        exit(1);
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@ -5,17 +5,14 @@
 #include "llama.h"
 #include "ggml.h"

-#include <cinttypes>
-#include <clocale>
 #include <cstdint>
 #include <cstdio>
+#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>

 int main(int argc, char ** argv){
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -6,7 +6,6 @@
 #include "log.h"
 #include "llama.h"

-#include <clocale>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
@ -14,8 +13,6 @@
 #include <vector>

 int main(int argc, char ** argv){
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@ -69,7 +69,7 @@ Command line arguments take precedence over environment variables when both are

 In cases where the transformer implementation for the model has not been released
 yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
-will then cause the transformer implementation to be loaded explicitly and not
+will then cause the transformer implementation to be loaded explicitly and not
 use AutoModelForCausalLM:
 ```
 export UNRELEASED_MODEL_NAME=SomeNewModel
@ -120,7 +120,7 @@ The converted model can be inspected using the following command:
 (venv) $ make causal-run-converted-model
 ```

-### Model logits verification
+### Model logits verification
 The following target will run the original model and the converted model and
 compare the logits:
 ```console
@ -235,7 +235,7 @@ new model the model can be converted to GGUF format using the following command:
 (venv) $ make embedding-run-converted-model
 ```

-### Model logits verification
+### Model logits verification
 The following target will run the original model and the converted model (which
 was done manually in the previous steps) and compare the logits:
 ```console
@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO

 ## HuggingFace utilities
 The following targets are useful for creating collections and model repositories
-on Hugging Face in the the ggml-org. These can be used when preparing a release
+on Hugging Face in the ggml-org. These can be used when preparing a release
 to script the process for new model releases.

 For the following targets a `HF_TOKEN` environment variable is required.
@ -347,7 +347,7 @@ For the following targets a `HF_TOKEN` environment variable is required.
 > $ unset HF_TOKEN

 ### Create a new Hugging Face Model (model repository)
-This will create a new model repository on Hugging Face with the specified
+This will create a new model repository on Hugging Face with the specified
 model name.
 ```console
 (venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -7,13 +7,12 @@
 #include "log.h"
 #include "llama.h"

-#include <algorithm>
-#include <clocale>
 #include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <ctime>
+#include <algorithm>

 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
@ -154,8 +153,6 @@ static std::vector<std::string> split_string(const std::string& input, char deli
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    srand(1234);

    common_params params;
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -3,7 +3,6 @@
 #include "log.h"
 #include "llama.h"

-#include <clocale>
 #include <cmath>
 #include <cstdio>
 #include <string>
@ -17,8 +16,6 @@ static void print_usage(int, char ** argv) {
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    params.n_junk = 250;
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -4,7 +4,6 @@
 #include "llama.h"

 #include <algorithm>
-#include <clocale>
 #include <fstream>
 #include <iostream> // TODO: remove me

@ -113,8 +112,6 @@ static void batch_process(llama_context * ctx, llama_batch & batch, float * outp
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -2,14 +2,11 @@
 #include "common.h"
 #include "llama.h"

-#include <clocale>
 #include <vector>
 #include <cstdio>


 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    params.prompt = "The quick brown fox";
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@ -1,5 +1,4 @@
 #include "llama.h"
-#include <clocale>
 #include <cstdio>
 #include <cstring>
 #include <iostream>
@ -13,8 +12,6 @@ static void print_usage(int, char ** argv) {
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    std::string model_path;
    int ngl = 99;
    int n_ctx = 2048;
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -1,5 +1,4 @@
 #include "llama.h"
-#include <clocale>
 #include <cstdio>
 #include <cstring>
 #include <string>
@ -12,8 +11,6 @@ static void print_usage(int, char ** argv) {
 }

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    // path to the model gguf file
    std::string model_path;
    // prompt to generate text from
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@ -5,15 +5,12 @@
 #include "log.h"
 #include "llama.h"

-#include <clocale>
 #include <cstdio>
 #include <cstring>
 #include <string>
 #include <vector>

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -5,7 +5,6 @@
 #include "llama.h"

 #include <algorithm>
-#include <clocale>
 #include <cstdio>
 #include <cstring>
 #include <random>
@ -31,8 +30,6 @@ struct seq_draft {
 };

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;

    // needed to get candidate probs even for temp <= 0.0
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@ -6,11 +6,11 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.

 |Tool Name| Function|Status|
 |-|-|-|
-|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
+|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|

 ### llama-ls-sycl-device

-List all SYCL devices with ID, compute capability, max work group size, etc.
+List all SYCL devices with ID, compute capability, max work group size, etc.

 1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.

--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@ -6,10 +6,8 @@


 #include "ggml-sycl.h"
-#include <clocale>

 int main() {
-    std::setlocale(LC_NUMERIC, "C");
    ggml_backend_sycl_print_sycl_devices();
    return 0;
 }
--- a/examples/training/finetune.cpp
+++ b/examples/training/finetune.cpp
@ -3,7 +3,6 @@
 #include "log.h"
 #include "llama.h"

-#include <clocale>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@ -15,8 +14,6 @@
 #endif

 int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
-
    common_params params;
    params.escape = false;

--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@ -259,7 +259,7 @@ extern "C" {
      Example usage:

        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
-        // preferably to run on the same backend as the buffer
+        // preferably to run on the same backend as the buffer
        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
--- a/ggml/include/ggml-opt.h
+++ b/ggml/include/ggml-opt.h
@ -138,7 +138,7 @@ extern "C" {
    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);

-    // set gradients to zero, initialize loss, and optionally reset the optimizer
+    // set gradients to zero, initialize loss, and optionally reset the optimizer
    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

    GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@ -8,12 +8,7 @@ extern "C" {

 #define RPC_PROTO_MAJOR_VERSION    3
 #define RPC_PROTO_MINOR_VERSION    6
-#define RPC_PROTO_PATCH_VERSION    1
-
-#ifdef  __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
-#endif
-
+#define RPC_PROTO_PATCH_VERSION    0
 #define GGML_RPC_MAX_SERVERS       16

 // backend API
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -427,8 +427,7 @@ extern "C" {
        // GGML_TYPE_IQ4_NL_4_8 = 37,
        // GGML_TYPE_IQ4_NL_8_8 = 38,
        GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
-        GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
-        GGML_TYPE_COUNT   = 41,
+        GGML_TYPE_COUNT   = 40,
    };

    // precision
@ -464,7 +463,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
        GGML_FTYPE_MOSTLY_MXFP4   = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_NVFP4   = 26, // except 1d tensors
    };

    // available tensor operations:
@ -558,7 +556,6 @@ extern "C" {
        GGML_OP_GATED_LINEAR_ATTN,
        GGML_OP_RWKV_WKV7,
        GGML_OP_SOLVE_TRI,
-        GGML_OP_GATED_DELTA_NET,

        GGML_OP_UNARY,

@ -2466,17 +2463,6 @@ extern "C" {
        bool                  lower,
        bool                  uni);

-    // TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST]
-    // ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
-    GGML_API struct ggml_tensor * ggml_gated_delta_net(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * q,
-            struct ggml_tensor  * k,
-            struct ggml_tensor  * v,
-            struct ggml_tensor  * g,
-            struct ggml_tensor  * beta,
-            struct ggml_tensor  * state);
-
    // custom operators

    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
@ -2589,7 +2575,7 @@ extern "C" {
        struct ggml_tensor *  grad,
        struct ggml_tensor *  sgd_params); // alpha, weight decay

-    // build forward multiple tensors and select one of them for computing
+    // build forward multiple tensors and select one of them for computing
    // this is useful for creating graphs that have constant topology but compute different things based on the input
    // ref: https://github.com/ggml-org/llama.cpp/pull/18550
    //
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@ -339,8 +339,8 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
 }

 static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // no memory to report
-    *free  = 0;
+    // TODO
+    *free = 0;
    *total = 0;

    GGML_UNUSED(dev);
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@ -102,9 +102,6 @@ typedef sycl::half2 ggml_half2;
 #define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
 #define QR_MXFP4 2

-#define QI_NVFP4 (QK_NVFP4 / (4 * QR_NVFP4))
-#define QR_NVFP4 2
-
 #define QI5_0 (QK5_0 / (4 * QR5_0))
 #define QR5_0 2

@ -197,14 +194,6 @@ typedef struct {
 } block_mxfp4;
 static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");

-#define QK_NVFP4 64
-#define QK_NVFP4_SUB 16  // sub-block size for per-group scales
-typedef struct {
-    uint8_t d[QK_NVFP4/QK_NVFP4_SUB]; // UE4M3 scales (4 bytes, one per 16-element sub-block)
-    uint8_t qs[QK_NVFP4/2];           // packed 4-bit E2M1 values (32 bytes)
-} block_nvfp4;
-static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding");
-
 #define QK5_0 32
 typedef struct {
    ggml_half d;           // delta
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@ -566,9 +566,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        # Fetch KleidiAI sources:
        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.22.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.16.0")
        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "54049037570ab0ee0a0d126b2ba5ece1")
+        set(KLEIDIAI_ARCHIVE_MD5  "0a9e9008adb6031f9e8cf70dff4a3321")

        if (POLICY CMP0135)
            cmake_policy(SET CMP0135 NEW)
@ -608,7 +608,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)

        set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
@ -649,6 +648,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        if (NOT SME_ENABLED MATCHES -1)
            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
@ -656,13 +656,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_asm.S
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa_asm.S
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_f16pmrx2_f32_neon.c
                ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
-            set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2+sme2+fp16")
+            set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
        endif()

        if (NOT SVE_ENABLED MATCHES -1)
--- a/ggml/src/ggml-cpu/amx/common.h
+++ b/ggml/src/ggml-cpu/amx/common.h
@ -9,8 +9,6 @@

 #if defined(GGML_USE_OPENMP)
 #include <omp.h>
-#else
-#include <thread>
 #endif

 #define TILE_M 16
@ -58,40 +56,18 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
 }

 template <typename func_t>
-inline void parallel_for(int n, const func_t & f) {
-    if (n <= 0) {
-        return;
-    }
+inline void parallel_for(int n, const func_t& f) {
 #if defined(GGML_USE_OPENMP)
-    #pragma omp parallel
-    {
-        int nth = omp_get_num_threads();
-        int ith = omp_get_thread_num();
-        int tbegin, tend;
-        balance211(n, nth, ith, tbegin, tend);
-        f(tbegin, tend);
-    }
+#pragma omp parallel
+{
+    int nth = omp_get_num_threads();
+    int ith = omp_get_thread_num();
+    int tbegin, tend;
+    balance211(n, nth, ith, tbegin, tend);
+    f(tbegin, tend);
+}
 #else
-    int nth = std::thread::hardware_concurrency();
-    if (nth <= 1) {
-        f(0, n);
-        return;
-    }
-    if (nth > n) {
-        nth = n;
-    }
-    std::vector<std::thread> threads;
-    threads.reserve(nth);
-    for (int ith = 0; ith < nth; ++ith) {
-        threads.emplace_back([&f, n, ith, nth] {
-            int tbegin, tend;
-            balance211(n, nth, ith, tbegin, tend);
-            f(tbegin, tend);
-        });
-    }
-    for (auto & t : threads) {
-        t.join();
-    }
+    f(0, n);
 #endif
 }

--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@ -195,7 +195,7 @@ struct tile_config_t{
 // will be needed.
 //
 // Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
-// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`.
+// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`.
 //
 // ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
 //    advanced-matrix-extensions-intrinsics-functions.html
@ -1379,8 +1379,8 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
        // sum of offsets, shared across COLS
        //
        // avx512-vnni does not have `_mm512_dpbssd_epi32`,
-        // need to transform ss to us:
-        //   a * (b - 8) is equivalent to b * a - 8 * a
+        // need to transform ss to us:
+        //   a * (b - 8) is equivalent to b * a - 8 * a
        //   s    u   u                   u   s   u   s
        //
        __m512i vcomp;
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@ -15,7 +15,6 @@
 #define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
 #define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@ -80,8 +79,6 @@
 #define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
-// quants.c
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@ -111,7 +108,6 @@
 // ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
@ -159,7 +155,6 @@
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@ -206,11 +201,9 @@
 #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
-#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
-#define ggml_quantize_mat_q8_K_4x1_generic ggml_quantize_mat_q8_K_4x1
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
@ -246,7 +239,6 @@
 #elif defined(__s390x__)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@ -309,7 +301,6 @@
 #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
--- a/ggml/src/ggml-cpu/arch/arm/quants.c
+++ b/ggml/src/ggml-cpu/arch/arm/quants.c
@ -650,90 +650,6 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;
 }

-void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK_NVFP4 == 0);
-
-    const block_nvfp4 * GGML_RESTRICT x = vx;
-    const block_q8_0 * GGML_RESTRICT y = vy;
-
-    // Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
-    const int nb = n / QK_NVFP4;
-
-    float sumf = 0;
-
-#if defined __ARM_NEON
-    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
-    const uint8x16_t m4b = vdupq_n_u8(0x0f);
-    float32x4_t acc = vdupq_n_f32(0.0f);
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
-        const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
-
-        const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits_0, m4b));
-        const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
-        const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits_1, m4b));
-        const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
-
-        const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
-        const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
-        const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
-        const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
-
-        const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
-        const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
-        const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
-        const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
-
-        const int32x4_t p0 = vaddq_s32(
-            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
-            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
-        const int32x4_t p1 = vaddq_s32(
-            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
-            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
-
-        const int32x4_t sums = vpaddq_s32(p0, p1);
-
-        // Decode 4 UE4M3 scales to f32 and multiply with q8 scales
-        const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
-        const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
-        const float32x4_t nvsc = {
-            ggml_ue4m3_to_fp32(x[ib].d[0]),
-            ggml_ue4m3_to_fp32(x[ib].d[1]),
-            ggml_ue4m3_to_fp32(x[ib].d[2]),
-            ggml_ue4m3_to_fp32(x[ib].d[3])
-        };
-        const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
-
-        acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
-    }
-    sumf = vaddvq_f32(acc);
-#else
-    for (int ib = 0; ib < nb; ++ib) {
-        for (int si = 0; si < 4; ++si) {
-            const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
-            const int q8b = si / 2;
-            const int q8o = (si % 2) * QK_NVFP4_SUB;
-            const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
-
-            int sumi_lo = 0, sumi_hi = 0;
-            for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
-                const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
-                sumi_lo += y[2*ib + q8b].qs[q8o + j +               0] * kvalues_mxfp4[qv & 0xf];
-                sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >>  4];
-            }
-            sumf += dy * d * (sumi_lo + sumi_hi);
-        }
-    }
-#endif
-    *s = sumf;
-}
-
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@ -1052,7 +968,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    const int vector_length = ggml_cpu_get_sve_cnt()*8;

-    //VLA Implementation for SVE
+    //VLA Implementation for SVE
    switch (vector_length) {
        case 128:
            {
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@ -781,7 +781,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int                        n,

                const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;

-                // Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
+                // Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
                // but still need the qs to use the low and hi bits from q4
                const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
                int8x16_t      q8_qs[8];
@ -3796,7 +3796,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int                        n,

                for (int b = 0; b < nb; b++) {
                    // bsums pairs belongs to the same q8_k subblock
-                    // 64 elements loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
+                    // 64 elements loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
                    const int16x8_t bsums[4]{
                        vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
                        vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
--- a/ggml/src/ggml-cpu/arch/riscv/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@ -423,7 +423,7 @@ void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
            quants_interleaved[j] = i0;
        }

-        // Masks to shuffle the quants of corresponding sub blocks for rearranging quants for vectorized bsums computation
+        // Masks to shuffle the quants of corresponding sub blocks for rearranging quants for vectorized bsums computation
        __m256i shuffle_mask_sb2 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 0, 1, 4, 5, 6, 7, 8, 9, 8, 9, 12, 13, 14, 15));
        shuffle_mask_sb2 = _mm256_permute2f128_si256(shuffle_mask_sb2, shuffle_mask_sb2, 0);
        __m256i shuffle_mask_sb3 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 8, 9, 14, 15));
@ -625,7 +625,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170));
                iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255));

-                // Accumulated values multiplied with appropriate scales
+                // Accumulated values multiplied with appropriate scales
                acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
            }

@ -868,7 +868,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                    const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
                    const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);

-                    // Multiply with appropriate scales and accumulate
+                    // Multiply with appropriate scales and accumulate
                    acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
                    acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
                    acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -1076,7 +1076,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
                const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);

-                // Multiply with appropriate scales and accumulate
+                // Multiply with appropriate scales and accumulate
                acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
                acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
                acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -1257,7 +1257,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
                    const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);

-                    // Multiply with appropriate scales and accumulate
+                    // Multiply with appropriate scales and accumulate
                    acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                    acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                    acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -1428,7 +1428,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
                const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);

-                // Multiply with appropriate scales and accumulate
+                // Multiply with appropriate scales and accumulate
                acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -1612,7 +1612,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    lhs_vec_11 = _mm256_permute2f128_si256(lhs_vec_11, lhs_vec_11, 0);

                    // Dot product done within 32 bit lanes and accumulated in the same vector
-                    // First done for first sub block and then for second sub block in each sb
+                    // First done for first sub block and then for second sub block in each sb
                    // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
                    // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
                    // ...........................................................................
@ -2422,7 +2422,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                        const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
                        const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);

-                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                        acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                        acc_rows[rp * 4  + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                        acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -2785,7 +2785,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
                    const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);

-                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
+                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
                    acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                    acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                    acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -2802,7 +2802,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
                }
            }
-            // Store accumulated values
+            // Store accumulated values
            for (int i = 0; i < 4; i++) {
                _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
            }
@ -3130,7 +3130,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                        const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
                        const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);//GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);

-                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                        acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                        acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                        acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -3460,7 +3460,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
                    const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); //GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);

-                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
+                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
                    acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                    acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                    acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@ -4268,7 +4268,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                        const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
                        const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);

-                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                        acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                        acc_rows[rp * 4  + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                        acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -5035,7 +5035,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
                }
            }
-            // Store accumulated values
+            // Store accumulated values
            for (int i = 0; i < 4; i++) {
                _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
            }
@ -5677,7 +5677,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                        const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
                        const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);

-                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                        acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                        acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                        acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@ -6349,7 +6349,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
                    const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);

-                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
+                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
                    acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                    acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                    acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -270,12 +270,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_0,
        .nrows                    = 1,
    },
-    [GGML_TYPE_NVFP4] = {
-        .from_float               = quantize_row_nvfp4,
-        .vec_dot                  = ggml_vec_dot_nvfp4_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
-    },
    [GGML_TYPE_Q2_K] = {
        .from_float               = quantize_row_q2_K,
        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
@ -2027,10 +2021,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_solve_tri(params, tensor);
            } break;
-        case GGML_OP_GATED_DELTA_NET:
-            {
-                ggml_compute_forward_gated_delta_net(params, tensor);
-            } break;
        case GGML_OP_MAP_CUSTOM1:
            {
                ggml_compute_forward_map_custom1(params, tensor);
@ -2210,7 +2200,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_COUNT_EQUAL:
        case GGML_OP_SOLVE_TRI:
-        case GGML_OP_GATED_DELTA_NET:
            {
                n_tasks = n_threads;
            } break;
@ -2488,7 +2477,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {

    if (prio != GGML_SCHED_PRIO_LOW) {
        // Tell Windows that this thread should not be throttled (needs its own CPU core).
-        // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
+        // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
        // all our threads onto the first 4 cores which results in terrible performance with
        // n_threads > 4
        #if _WIN32_WINNT >= 0x0602
@ -2916,11 +2905,6 @@ struct ggml_cplan ggml_graph_plan(
                    {
                        cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                    } break;
-                case GGML_OP_GATED_DELTA_NET:
-                    {
-                        const int64_t S_v = node->src[2]->ne[0];
-                        cur = S_v * sizeof(float) * n_tasks;
-                    } break;
                case GGML_OP_COUNT:
                    {
                        GGML_ABORT("fatal error");
--- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 // SPDX-License-Identifier: MIT
 //

@ -9,6 +9,7 @@
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
+#include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
 #include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
 #include "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
@ -19,7 +20,6 @@
 #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h"
-#include "kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa.h"

 #include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
 #include "kai_lhs_quant_pack_qsi8d32p_f32.h"
@ -31,7 +31,6 @@
 #include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
 #include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
 #include "kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h"
-#include "kai_lhs_pack_f16pmrx2_f32_neon.h"

 #include "kai_common.h"

@ -310,24 +309,24 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
    {
        /* SME GEMM */
        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa,
-            /* .get_lhs_offset_ex     = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa>,
-            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa>,
-            /* .run_kernel_ex         = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa>,
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_lhs_offset_ex     = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
+            /* .run_kernel_ex         = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
        },

        /* .gemm_lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_pack_f16pmrx2_f32_neon,
-            /* .get_packed_offset_ex  = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_pack_f16pmrx2_f32_neon>,
-            /* .packed_size_ex        = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_pack_f16pmrx2_f32_neon>,
-            /* .pack_func_ex          = */ &lhs_pack_void_fn10<kai_run_lhs_pack_f16pmrx2_f32_neon>,
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
+            /* .get_packed_offset_ex  = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon>,
+            /* .packed_size_ex        = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon>,
+            /* .pack_func_ex          = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32_neon>,
        },
        /* SME GEMV */
        /* .kern_info = */ {
@ -520,7 +519,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .packed_stride_ex      = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
            /* .pack_func_ex          = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
        },
-        /* .required_cpu       = */ CPU_FEATURE_I8MM,
+        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
        /* .lhs_type           = */ GGML_TYPE_F32,
        /* .rhs_type           = */ GGML_TYPE_Q4_0,
        /* .op_type            = */ GGML_TYPE_F32,
@ -631,7 +630,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .packed_stride_ex      = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
            /* .pack_func_ex          = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
        },
-        /* .required_cpu       = */ CPU_FEATURE_I8MM,
+        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
        /* .lhs_type           = */ GGML_TYPE_F32,
        /* .rhs_type           = */ GGML_TYPE_Q4_0,
        /* .op_type            = */ GGML_TYPE_F32,
@ -801,7 +800,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
            /* .packed_stride_ex      = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
            /* .pack_func_ex          = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
        },
-        /* .required_cpu       = */ CPU_FEATURE_I8MM,
+        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
        /* .lhs_type           = */ GGML_TYPE_F32,
        /* .rhs_type           = */ GGML_TYPE_Q8_0,
        /* .op_type            = */ GGML_TYPE_F32,
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
--- a/Show More
+++ b/Show More