Merge 6cf18ecd6b into d006858316

ggml-webgpu: move from parameter buffer pool to single buffer with offsets (#21278)
* Work towards removing bitcast * Move rest of existing types over * Add timeout back to wait and remove synchronous set_tensor/memset_tensor * move to unpackf16 for wider compatibility * cleanup * Remove deadlock condition in free_bufs * Start work on removing parameter buffer pools * Simplify and optimize further * simplify profile futures * Fix stride * Try using a single command buffer per batch * formatting
2026-04-04 02:06:49 +02:00 · 2026-04-03 11:40:14 -07:00 · 2026-04-03 20:16:44 +03:00 · 2026-04-03 19:02:27 +02:00 · 2026-04-03 17:51:52 +02:00 · 2026-04-03 17:51:23 +02:00
46 changed files with 4225 additions and 8022 deletions
--- a/.devops/cuda-new.Dockerfile
+++ b/.devops/cuda-new.Dockerfile
@ -1,97 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=13.1.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
-
-ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_CUDA_RUN_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -16,7 +16,7 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
-  curl,
+  openssl,
  shaderc,
  useBlas ?
    builtins.all (x: !x) [
@ -160,7 +160,8 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ++ optionals useMpi [ mpi ]
    ++ optionals useRocm rocmBuildInputs
    ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs;
+    ++ optionals useVulkan vulkanBuildInputs
+    ++ [ openssl ];

  cmakeFlags =
    [
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -27,6 +27,11 @@ IBM zDNN:
        - any-glob-to-any-file:
            - ggml/include/ggml-zdnn.h
            - ggml/src/ggml-zdnn/**
+AMD ZenDNN:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml/include/ggml-zendnn.h
+            - ggml/src/ggml-zendnn/**
 documentation:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@ -213,6 +213,27 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

+  ggml-ci-win-intel-vulkan:
+    runs-on: [self-hosted, Windows, X64, Intel]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
+        env:
+          MSYSTEM: UCRT64
+          CHERE_INVOKING: 1
+          PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
+        run: |
+          vulkaninfo --summary
+          # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
+          # a valid python environment for testing
+          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
+
  ggml-ci-intel-openvino-gpu-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -472,6 +472,7 @@ jobs:
          cmake -B build -S . \
            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGPU_TARGETS="gfx1030" \
            -DGGML_HIP=ON
          cmake --build build --config Release -j $(nproc)

@ -990,6 +991,7 @@ jobs:
            -DROCM_DIR="${env:HIP_PATH}" `
            -DGGML_HIP=ON `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGPU_TARGETS="gfx1100"  `
            -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -73,10 +73,10 @@ jobs:
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@ -59,7 +59,7 @@ jobs:
        run: |
          cmake -B build -S . \
            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGPU_TARGETS=gfx908 \
+            -DGPU_TARGETS=gfx942 \
            -DGGML_HIP=ON \
            -DGGML_HIP_EXPORT_METRICS=Off \
            -DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
--- a/ci/run.sh
+++ b/ci/run.sh
@ -119,6 +119,11 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
    fi

+    # Build shared libs on Windows
+    # to reduce binary size and avoid errors in library loading unit tests
+    if uname -s | grep -qi nt; then
+        CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
+    fi
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
@ -221,7 +226,7 @@ function gg_run_ctest_debug {

    set -e

-    # Check cmake and ctest are installed
+    # Check required binaries are installed
    gg_check_build_requirements

    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
@ -252,7 +257,7 @@ function gg_run_ctest_release {

    set -e

-    # Check cmake and ctest are installed
+    # Check required binaries are installed
    gg_check_build_requirements

    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
@ -627,10 +632,38 @@ function gg_sum_rerank_tiny {
 }

 function gg_check_build_requirements {
+    if ! command -v git &> /dev/null; then
+        gg_printf 'git not found, please install'
+    fi
+
+    if ! command -v git-lfs &> /dev/null; then
+        gg_printf 'git-lfs not found, please install'
+    fi
+
+    if ! command -v wget &> /dev/null; then
+        gg_printf 'wget not found, please install'
+    fi
+
+    if ! command -v python3 &> /dev/null; then
+        gg_printf 'python3 not found, please install'
+    fi
+
+    if ! command -v pip3 &> /dev/null; then
+        gg_printf 'pip3 not found, please install'
+    fi
+
+    if ! python3 -m ensurepip --help &> /dev/null; then
+        gg_printf 'ensurepip not found, please install python3-venv package'
+    fi
+
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
    fi

+    if ! command -v ccache &> /dev/null; then
+        gg_printf 'ccache not found, please consider installing for faster builds'
+    fi
+
    if ! command -v ctest &> /dev/null; then
        gg_printf 'ctest not found, please install'
    fi
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -1311,6 +1311,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.kv_unified = value;
        }
    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
+    add_opt(common_arg(
+        {"--clear-idle"},
+        {"--no-clear-idle"},
+        "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
+        [](common_params & params, bool value) {
+            params.clear_idle = value;
+        }
+    ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--context-shift"},
        {"--no-context-shift"},
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -6,6 +6,7 @@
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
+#include "peg-parser.h"

 #include <algorithm>
 #include <stdexcept>
@ -317,6 +318,44 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
           p.end();
 }

+common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
+                                                    const common_peg_parser & call_id_section, bool have_call_id,
+                                                    const common_peg_parser & args,
+                                                    std::optional<common_peg_parser> atomic_peek) const {  // atomic_peek: only consulted by the third branch below
+    auto              open           = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix);
+    bool              matched_atomic = false;  // true once a branch below has settled atomicity; suppresses the final atomic() wrap
+    common_peg_parser func_parser    = p.eps();
+
+    if (!function.name_suffix.empty()) {  // name suffix present: plain sequence, no atomic() wrapper is applied at all
+        func_parser    = open + call_id_section + p.space() + args;
+        matched_atomic = true;
+    } else if (have_call_id) {  // atomic() covers only the opener plus the call id section
+        func_parser    = p.atomic(open + call_id_section) + p.space() + args;
+        matched_atomic = true;
+    } else if (atomic_peek.has_value()) {  // atomic() covers opener, call id section, space and the caller-supplied peek
+        func_parser    = p.atomic(open + call_id_section + p.space() + *atomic_peek) + args;
+        matched_atomic = true;
+    } else {  // fallback: same sequence as the first branch, but wrapped in atomic() as a whole at the end
+        func_parser = open + call_id_section + p.space() + args;
+    }
+
+    if (!function.close.empty()) {  // explicit function close marker takes precedence over per_call_end
+        func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
+    } else if (!format.per_call_end.empty()) {
+        // When there's no func_close but there is a per_call_end marker, use peek() to ensure
+        // we only emit tool_close when we can actually see the closing marker. This prevents
+        // premature closing during partial parsing when we've seen e.g. "</" which could be
+        // either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
+        func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
+    } else {  // neither a close marker nor a per-call end marker is known
+        func_parser = func_parser + p.tool_close(p.space());  // force this to process tool closing callbacks in mapper
+    }
+    if (!matched_atomic) {  // only reached via the fallback branch above; the close handling is inside the atomic group
+        func_parser = p.atomic(func_parser);
+    }
+    return func_parser;
+}
+
 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
@ -330,17 +369,27 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
        const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();

        // Build call_id parser based on position (if supported)
+        bool have_call_id = false;
        common_peg_parser call_id_section = p.eps();
        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            !call_id.suffix.empty()) {
-            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
+            (!call_id.suffix.empty() || !arguments.start.empty())) {
+            if (!call_id.suffix.empty()) {
+                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
+            } else {
+                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
+            }
+            have_call_id = true;
+        }
+        auto args_parser = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
+        if (!arguments.start.empty()) {
+            args_parser = p.literal(arguments.start) + args_parser;
+        }
+        if (!arguments.end.empty()) {
+            args_parser = args_parser + p.literal(arguments.end);
        }

-        auto func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                           call_id_section + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
-        if (!function.close.empty()) {
-            func_parser = func_parser + function.close;
-        }
+        auto atomic_peek = !arguments.start.empty() ? std::optional(p.peek(p.literal(arguments.start))) : std::nullopt;
+        auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_parser, atomic_peek);
        tool_choice |= p.rule("tool-" + name, func_parser);
    });

@ -400,12 +449,34 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
        for (const auto & [param_name, param_schema] : properties.items()) {
            bool        is_required = required.find(param_name) != required.end();
            std::string type        = "object";
-            auto        type_obj    = param_schema.contains("type") ? param_schema.at("type") : json::object();
-            if (type_obj.is_string()) {
-                type_obj.get_to(type);
-            } else if (type_obj.is_object()) {
-                if (type_obj.contains("type") && type_obj.at("type").is_string()) {
-                    type_obj.at("type").get_to(type);
+            if (param_schema.contains("type")) {
+                const auto & type_obj = param_schema.at("type");
+                if (type_obj.is_string()) {
+                    type_obj.get_to(type);
+                } else if (type_obj.is_array()) {
+                    // Handle nullable types like ["string", "null"]
+                    for (const auto & t : type_obj) {
+                        if (t.is_string() && t.get<std::string>() != "null") {
+                            type = t.get<std::string>();
+                            break;
+                        }
+                    }
+                } else if (type_obj.is_object()) {
+                    if (type_obj.contains("type") && type_obj.at("type").is_string()) {
+                        type_obj.at("type").get_to(type);
+                    }
+                }
+            }
+            // Infer string type from enum values when type is unspecified
+            if (type == "object" && param_schema.contains("enum")) {
+                const auto & enum_vals = param_schema.at("enum");
+                if (enum_vals.is_array()) {
+                    for (const auto & v : enum_vals) {
+                        if (v.is_string()) {
+                            type = "string";
+                            break;
+                        }
+                    }
                }
            }

@ -448,52 +519,31 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
        }

+        if (!arguments.start.empty()) {
+            args_seq = p.literal(arguments.start) + args_seq;
+        }
+        if (!arguments.end.empty()) {
+            args_seq = args_seq + p.literal(arguments.end);
+        }
+
        // Build call_id parser based on position (if supported)
        common_peg_parser call_id_section = p.eps();
        bool have_call_id = false;
        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            !call_id.suffix.empty()) {
+            (!call_id.suffix.empty() || !arguments.start.empty())) {
            have_call_id = true;
-            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
-        }
-
-        bool matched_atomic = false;
-        common_peg_parser func_parser = p.eps();
-        if (!function.name_suffix.empty()) {
-            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section + p.space() + args_seq;
-            matched_atomic = true;
-        } else if (have_call_id) {
-            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section) + p.space() + args_seq;
-            matched_atomic = true;
-        } else if (!arguments.name_prefix.empty() && !required_parsers.empty()) {
-            // Only peek for an arg tag when there are required args that must follow.
-            // When all args are optional, the model may emit no arg tags at all (#20650).
-            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
-            matched_atomic = true;
-        } else {
-            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
-                call_id_section + p.space() + args_seq;
-        }
-
-        if (!function.close.empty()) {
-            func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
-        } else if (!format.per_call_end.empty()) {
-            // When there's no func_close but there is a per_call_end marker, use peek() to ensure
-            // we only emit tool_close when we can actually see the closing marker. This prevents
-            // premature closing during partial parsing when we've seen e.g. "</" which could be
-            // either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
-            func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
-        } else {
-            func_parser =
-                func_parser + p.tool_close(p.space());  // force this to process tool closing callbacks in mapper
-        }
-        if (!matched_atomic) {
-            func_parser = p.atomic(func_parser);
+            if (!call_id.suffix.empty()) {
+                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
+            } else {
+                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
+            }
        }

+        // Only peek for an arg tag when there are required args that must follow.
+        // When all args are optional, the model may emit no arg tags at all (#20650).
+        auto atomic_peek = (!arguments.name_prefix.empty() && !required_parsers.empty()) ?
+            std::optional(p.peek(p.literal(arguments.name_prefix))) : std::nullopt;
+        auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_seq, atomic_peek);
        tool_choice |= p.rule("tool-" + name, func_parser);
    });

@ -574,9 +624,33 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
        std::vector<arg_entry> arg_entries;

        for (const auto & [param_name, param_schema] : properties.items()) {
-            std::string type    = "object";
-            auto        type_v  = param_schema.contains("type") ? param_schema.at("type") : json::object();
-            if (type_v.is_string()) type_v.get_to(type);
+            std::string type = "object";
+            if (param_schema.contains("type")) {
+                const auto & type_v = param_schema.at("type");
+                if (type_v.is_string()) {
+                    type_v.get_to(type);
+                } else if (type_v.is_array()) {
+                    // Handle nullable types like ["string", "null"]
+                    for (const auto & t : type_v) {
+                        if (t.is_string() && t.get<std::string>() != "null") {
+                            type = t.get<std::string>();
+                            break;
+                        }
+                    }
+                }
+            }
+            // Infer string type from enum values when type is unspecified
+            if (type == "object" && param_schema.contains("enum")) {
+                const auto & enum_vals = param_schema.at("enum");
+                if (enum_vals.is_array()) {
+                    for (const auto & v : enum_vals) {
+                        if (v.is_string()) {
+                            type = "string";
+                            break;
+                        }
+                    }
+                }
+            }

            common_peg_parser value_parser = p.eps();
            if (type == "string") {
--- a/common/chat-auto-parser-helpers.h
+++ b/common/chat-auto-parser-helpers.h
@ -1,7 +1,7 @@
 #pragma once

 #include "chat-auto-parser.h"
-#include "peg-parser.h"
+
 #include <functional>
 #include <optional>
 #include <string>
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@ -4,6 +4,7 @@
 #include "common.h"
 #include "jinja/caps.h"
 #include "peg-parser.h"
+#include "nlohmann/json.hpp"

 #include <chrono>
 #include <optional>
@ -355,6 +356,13 @@ struct analyze_tools : analyze_base {
    common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
    common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
    common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
+
+    // Shared helper: builds func_parser from open+call_id+args, handling atomic wrapping and close.
+    // atomic_peek: if present, used as the peek expression in the third atomicity branch.
+    common_peg_parser build_func_parser(common_chat_peg_builder & p, const std::string & name,
+                                        const common_peg_parser & call_id_section, bool have_call_id,
+                                        const common_peg_parser & args,
+                                        std::optional<common_peg_parser> atomic_peek) const;
    common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const;
 };

--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@ -25,6 +25,9 @@ static const std::string ARG_SECOND = "BB_ARG_SND_BB";
 static const std::string USER_MSG = "U_USER_MSG Hello END_U";
 static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
 static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
+static const std::string CALL_ID_001 = "call00001";
+static const std::string CALL_ID_002 = "call00002";
+static const std::string CALL_ID_999 = "call99999";

 static std::vector<std::function<void(const common_chat_template & tmpl, autoparser &)>> workarounds(
    { // Old reasoning Qwen templates - they don't really display reasoning content, but we still want to
@ -131,6 +134,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              analysis.tools.function.name_prefix  = "<｜tool▁sep｜>";
              analysis.tools.format.per_call_end   = "<｜tool▁call▁end｜>";
              analysis.tools.function.close        = "```";
+              LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
          }
      }
    });
@ -158,7 +162,7 @@ static json user_msg = json{
    { "content", USER_MSG }
 };

-static json build_tool_call(const std::string & name, const json & args, const std::string & id = "call00001") {
+static json build_tool_call(const std::string & name, const json & args, const std::string & id = CALL_ID_001) {
    return json{
        { "id",       id                                              },
        { "type",     "function"                                      },
@ -166,17 +170,17 @@ static json build_tool_call(const std::string & name, const json & args, const s
    };
 }

-static json first_tool_call_zero_args         = build_tool_call(FUN_FIRST, json::object(), "call00001");
-static json first_tool_call_one_arg           = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, "call00001");
-static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, "call00001");
-static json first_tool_call_other_arg         = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, "call00001");
+static json first_tool_call_zero_args         = build_tool_call(FUN_FIRST, json::object(), CALL_ID_001);
+static json first_tool_call_one_arg           = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, CALL_ID_001);
+static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, CALL_ID_001);
+static json first_tool_call_other_arg         = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, CALL_ID_001);

 static json first_tool_call =
-    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00001");
+    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_001);
 static json second_tool_call =
-    build_tool_call(FUN_SECOND, json{ { ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00002");
+    build_tool_call(FUN_SECOND, json{ { ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_002);
 static json first_tool_call_alt_id =
-    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, "call99999");
+    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_999);

 template <typename T>
 static std::string mode_to_str(T mode) {
@ -215,6 +219,11 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
    LOG_DBG("func_name_prefix: '%s'\n", tools.function.name_prefix.c_str());
    LOG_DBG("func_name_suffix: '%s'\n", tools.function.name_suffix.c_str());
    LOG_DBG("func_close: '%s'\n", tools.function.close.c_str());
+    LOG_DBG("call_id_prefix: '%s'\n", tools.call_id.prefix.c_str());
+    LOG_DBG("call_id_suffix: '%s'\n", tools.call_id.suffix.c_str());
+    LOG_DBG("call_id_pos: '%s'\n", mode_to_str(tools.call_id.pos).c_str());
+    LOG_DBG("args_start: '%s'\n", tools.arguments.start.c_str());
+    LOG_DBG("args_end: '%s'\n", tools.arguments.end.c_str());
    LOG_DBG("arg_name_prefix: '%s'\n", tools.arguments.name_prefix.c_str());
    LOG_DBG("arg_name_suffix: '%s'\n", tools.arguments.name_suffix.c_str());
    LOG_DBG("arg_value_prefix: '%s'\n", tools.arguments.value_prefix.c_str());
@ -583,12 +592,15 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
        if (caps.supports_parallel_tool_calls) {
            check_per_call_markers();
        }
+        LOG_DBG(ANSI_ORANGE "Phase 3a: Function call analysis\n" ANSI_RESET);
        extract_function_markers();
+        LOG_DBG(ANSI_ORANGE "Phase 3b: Argument analysis\n" ANSI_RESET);
        if (format.mode == tool_format::TAG_WITH_TAGGED) {
            analyze_arguments();
        }
        extract_argument_separator();
        extract_args_markers();
+        LOG_DBG(ANSI_ORANGE "Phase 3c: Call id analysis\n" ANSI_RESET);
        extract_call_id_markers();
    }
 }
@ -979,8 +991,6 @@ void analyze_tools::extract_function_markers() {
 }

 void analyze_tools::analyze_arguments() {
-    LOG_DBG(ANSI_ORANGE "Phase 4: Argument analysis\n" ANSI_RESET);
-
    extract_argument_name_markers();
    extract_argument_value_markers();
 }
@ -1189,7 +1199,7 @@ void analyze_tools::extract_args_markers() {

    const auto & diff = comparison->diff;

-    if (format.mode != tool_format::JSON_NATIVE) {
+    if (format.mode == tool_format::JSON_NATIVE) {
        std::string prefix_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
        std::string suffix_marker = !format.section_end.empty() ? format.section_end : format.per_call_end;
        // these might happen earlier in the tools section as an example or somewhere else, so we need to find the closest ones
@ -1211,6 +1221,10 @@ void analyze_tools::extract_args_markers() {
            if (find_fun != std::string::npos) {
                args_start = args_start.substr(find_fun + FUN_FIRST.size(), args_start.size() - find_fun - FUN_FIRST.size());
            }
+            size_t find_call_id = args_start.find(CALL_ID_001);
+            if (find_call_id != std::string::npos) {
+                args_start = args_start.substr(find_call_id + CALL_ID_001.size(), args_start.size() - find_call_id - CALL_ID_001.size());
+            }
            arguments.start = args_start;
            arguments.end   = args_end;
        }
@ -1250,8 +1264,8 @@ void analyze_tools::extract_call_id_markers() {
        return;
    }

-    std::string id_value_1 = "call00001";
-    std::string id_value_2 = "call99999";
+    std::string id_value_1 = CALL_ID_001;
+    std::string id_value_2 = CALL_ID_999;

    size_t common_id_prefix_len = 0;
    for (size_t i = 0; i < std::min(id_value_1.length(), id_value_2.length()); i++) {
@ -1350,6 +1364,14 @@ void analyze_tools::extract_call_id_markers() {
        call_id.suffix = find_first_marker(before_func);
    }

+    if (call_id.prefix == arguments.end) {
+        call_id.prefix = "";
+    }
+
+    if (call_id.suffix == arguments.start) {
+        call_id.suffix = "";
+    }
+
    // When call_id is detected, per_call_end may have been incorrectly set to include
    // the call_id_suffix and sample args. Clear it if it starts with call_id_suffix.
    if (call_id.pos != call_id_position::NONE && !call_id.suffix.empty() &&
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -13,6 +13,8 @@
 #include "jinja/caps.h"
 #include "peg-parser.h"

+#include "nlohmann/json.hpp"
+
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
@ -762,12 +764,12 @@ static void foreach_parameter(const json &
    }
 }

-std::string common_chat_template_direct_apply(
+static std::string common_chat_template_direct_apply_impl(
    const common_chat_template & tmpl,
    const autoparser::generation_params & inputs,
-    const std::optional<json> & messages_override,
-    const std::optional<json> & tools_override,
-    const std::optional<json> & additional_context) {
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt) {
    jinja::context ctx(tmpl.source());

    nlohmann::ordered_json inp = nlohmann::ordered_json{
@ -814,6 +816,12 @@ std::string common_chat_template_direct_apply(
    return result;
 }

+std::string common_chat_template_direct_apply(
+    const common_chat_template & tmpl,
+    const autoparser::generation_params & inputs) {
+    return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
+}
+
 static common_chat_params common_chat_params_init_ministral_3(const common_chat_template &    tmpl,
                                                              const autoparser::generation_params & inputs) {
    common_chat_params data;
@ -864,7 +872,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
    data.supports_thinking  = true;
    data.thinking_start_tag = "[THINK]";
    data.thinking_end_tag   = "[/THINK]";
-    data.prompt            = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.preserved_tokens  = {
        "[THINK]",
@ -947,7 +955,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
        adjusted_messages.push_back(msg);
    }

-    auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
+    auto prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);

    // Check if we need to replace the return token with end token during
    // inference and without generation prompt. For more details see:
@ -1074,7 +1082,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
                                                                   const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
+    data.prompt           = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.preserved_tokens = {
        ">>>all",
@ -1168,7 +1176,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
                                                          const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt             = common_chat_template_direct_apply(tmpl, inputs);
+    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking  = true;
    data.preserved_tokens  = {
@ -1291,7 +1299,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
                                                       const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    data.preserved_tokens  = {
@ -1370,7 +1378,7 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
                                                         const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    data.preserved_tokens  = {
@ -1441,7 +1449,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(

    common_chat_params data;

-    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = false;
    data.preserved_tokens  = {
@ -1623,7 +1631,7 @@ static json common_chat_extra_context() {
    return ctx;
 }

-static std::optional<common_chat_params> try_specialized_template(
+std::optional<common_chat_params> common_chat_try_specialized_template(
        const common_chat_template &          tmpl,
        const std::string &                   src,
        const autoparser::generation_params & params) {
@ -1724,9 +1732,9 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
    }

    params.add_generation_prompt = false;
-    std::string no_gen_prompt    = common_chat_template_direct_apply(tmpl, params);
+    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
    params.add_generation_prompt = true;
-    std::string gen_prompt       = common_chat_template_direct_apply(tmpl, params);
+    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
    auto        diff             = calculate_diff_split(no_gen_prompt, gen_prompt);
    params.generation_prompt     = diff.right;

@ -1760,7 +1768,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        common_chat_params data;
        auto params_copy               = params;
        params_copy.reasoning_format   = COMMON_REASONING_FORMAT_NONE;
-        data.prompt                    = common_chat_template_direct_apply(tmpl, params_copy);
+        data.prompt                    = common_chat_template_direct_apply_impl(tmpl, params_copy);
        data.format                    = COMMON_CHAT_FORMAT_PEG_NATIVE;
        data.generation_prompt         = params.generation_prompt;
        auto parser                    = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
@ -1770,7 +1778,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        return data;
    }

-    if (auto result = try_specialized_template(tmpl, src, params)) {
+    if (auto result = common_chat_try_specialized_template(tmpl, src, params)) {
        result->generation_prompt = params.generation_prompt;
        return *result;
    }
--- a/common/chat.h
+++ b/common/chat.h
@ -3,12 +3,12 @@
 #pragma once

 #include "common.h"
-#include "jinja/parser.h"
-#include "nlohmann/json_fwd.hpp"
 #include "peg-parser.h"
+#include "jinja/parser.h"
 #include "jinja/runtime.h"
 #include "jinja/caps.h"
-#include "nlohmann/json.hpp"
+
+#include "nlohmann/json_fwd.hpp"

 #include <chrono>
 #include <functional>
@ -19,8 +19,6 @@
 using chat_template_caps = jinja::caps;
 using json = nlohmann::ordered_json;

-#include <nlohmann/json_fwd.hpp>
-
 struct common_chat_templates;

 namespace autoparser {
@ -75,41 +73,9 @@ struct common_chat_template {
    const std::string & bos_token() const { return bos_tok; }
    const std::string & eos_token() const { return eos_tok; }

-    // TODO: this is ugly, refactor it somehow
-    json add_system(const json & messages, const std::string & system_prompt) const {
-        GGML_ASSERT(messages.is_array());
-        auto msgs_copy = messages;
-        if (!caps.supports_system_role) {
-            if (msgs_copy.empty()) {
-                msgs_copy.insert(msgs_copy.begin(), json{
-                    {"role", "user"},
-                    {"content", system_prompt}
-                });
-            } else {
-                auto & first_msg = msgs_copy[0];
-                if (!first_msg.contains("content")) {
-                    first_msg["content"] = "";
-                }
-                first_msg["content"] = system_prompt + "\n\n"
-                    + first_msg["content"].get<std::string>();
-            }
-        } else {
-            if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
-                msgs_copy.insert(msgs_copy.begin(), json{
-                    {"role", "system"},
-                    {"content", system_prompt}
-                });
-            } else if (msgs_copy[0].at("role") == "system") {
-                msgs_copy[0]["content"] = system_prompt;
-            }
-        }
-        return msgs_copy;
-    }
-
    chat_template_caps original_caps() const {
        return caps;
    }
-
 };

 struct common_chat_msg {
@ -257,8 +223,8 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
                                                     const std::string &        bos_token_override = "",
                                                     const std::string &        eos_token_override = "");

-bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
-std::string  common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
+bool        common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
+std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");

 struct common_chat_params common_chat_templates_apply(const struct common_chat_templates *        tmpls,
                                                      const struct common_chat_templates_inputs & inputs);
@ -275,9 +241,9 @@ std::string common_chat_format_example(const struct common_chat_templates *
                                       bool                                       use_jinja,
                                       const std::map<std::string, std::string> & chat_template_kwargs);

-const char *            common_chat_format_name(common_chat_format format);
-common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
-common_chat_msg           common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
+const char *    common_chat_format_name(common_chat_format format);
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
+common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);

 // used by arg and server
 const char *            common_reasoning_format_name(common_reasoning_format format);
@ -303,7 +269,9 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem

 std::string common_chat_template_direct_apply(
    const common_chat_template & tmpl,
-    const autoparser::generation_params & inputs,
-    const std::optional<json> & messages_override = std::nullopt,
-    const std::optional<json> & tools_override = std::nullopt,
-    const std::optional<json> & additional_context = std::nullopt);
+    const autoparser::generation_params & inputs);
+
+std::optional<common_chat_params> common_chat_try_specialized_template(
+        const common_chat_template &          tmpl,
+        const std::string &                   src,
+        const autoparser::generation_params & params);
--- a/common/common.h
+++ b/common/common.h
@ -579,8 +579,9 @@ struct common_params {
    int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
    bool    cache_prompt        = true;  // whether to enable prompt caching
-    int32_t n_ctx_checkpoints   = 32;     // max number of context checkpoints per slot
-    int32_t checkpoint_every_nt = 8192;   // make a checkpoint every n tokens during prefill
+    bool    clear_idle          = true;  // save and clear idle slots upon starting a new task
+    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
+    int32_t checkpoint_every_nt = 8192;  // make a checkpoint every n tokens during prefill
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@ -306,6 +306,19 @@ value filter_expression::execute_impl(context & ctx) {
            filter_id = "strip"; // alias
        }
        JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
+        // TODO: Refactor filters so this coercion can be done automatically
+        if (!input->is_undefined() && !is_val<value_string>(input) && (
+            filter_id == "capitalize" ||
+            filter_id == "lower" ||
+            filter_id == "replace" ||
+            filter_id == "strip" ||
+            filter_id == "title" ||
+            filter_id == "upper" ||
+            filter_id == "wordcount"
+        )) {
+            JJ_DEBUG("Coercing %s to String for '%s' filter", input->type().c_str(), filter_id.c_str());
+            input = mk_val<value_string>(input->as_string());
+        }
        return try_builtin_func(ctx, filter_id, input)->invoke(func_args(ctx));

    } else if (is_stmt<call_expression>(filter)) {
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@ -465,8 +465,9 @@ const func_builtins & value_int_t::get_builtins() const {
            double val = static_cast<double>(args.get_pos(0)->as_int());
            return mk_val<value_float>(val);
        }},
-        {"tojson", tojson},
+        {"safe", tojson},
        {"string", tojson},
+        {"tojson", tojson},
    };
    return builtins;
 }
@ -485,8 +486,9 @@ const func_builtins & value_float_t::get_builtins() const {
            int64_t val = static_cast<int64_t>(args.get_pos(0)->as_float());
            return mk_val<value_int>(val);
        }},
-        {"tojson", tojson},
+        {"safe", tojson},
        {"string", tojson},
+        {"tojson", tojson},
    };
    return builtins;
 }
@ -771,6 +773,11 @@ const func_builtins & value_string_t::get_builtins() const {


 const func_builtins & value_bool_t::get_builtins() const {
+    static const func_handler tostring = [](const func_args & args) -> value {
+        args.ensure_vals<value_bool>();
+        bool val = args.get_pos(0)->as_bool();
+        return mk_val<value_string>(val ? "True" : "False");
+    };
    static const func_builtins builtins = {
        {"default", default_value},
        {"int", [](const func_args & args) -> value {
@ -783,11 +790,8 @@ const func_builtins & value_bool_t::get_builtins() const {
            bool val = args.get_pos(0)->as_bool();
            return mk_val<value_float>(val ? 1.0 : 0.0);
        }},
-        {"string", [](const func_args & args) -> value {
-            args.ensure_vals<value_bool>();
-            bool val = args.get_pos(0)->as_bool();
-            return mk_val<value_string>(val ? "True" : "False");
-        }},
+        {"safe", tostring},
+        {"string", tostring},
        {"tojson", tojson},
    };
    return builtins;
@ -1100,18 +1104,14 @@ const func_builtins & value_object_t::get_builtins() const {
 }

 const func_builtins & value_none_t::get_builtins() const {
+    static const func_handler tostring = [](const func_args &) -> value {
+        return mk_val<value_string>("None");
+    };
    static const func_builtins builtins = {
        {"default", default_value},
        {"tojson", tojson},
-        {"string", [](const func_args &) -> value {
-            return mk_val<value_string>("None");
-        }},
-        {"safe", [](const func_args &) -> value {
-            return mk_val<value_string>("None");
-        }},
-        {"strip", [](const func_args &) -> value {
-            return mk_val<value_string>("None");
-        }},
+        {"string", tostring},
+        {"safe", tostring},
        {"items", empty_value_fn<value_array>},
        {"map", empty_value_fn<value_array>},
        {"reject", empty_value_fn<value_array>},
--- a/common/peg-parser.cpp
+++ b/common/peg-parser.cpp
@ -1561,7 +1561,23 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
        if (!s.schema) {
            return true;
        }
-        if (s.raw && s.schema->contains("type") && s.schema->at("type").is_string() && s.schema->at("type") == "string") {
+        if (s.raw && s.schema->contains("type")) {
+            const auto & type_val = s.schema->at("type");
+            if (type_val.is_string() && type_val == "string") {
+                return true;
+            }
+            // Handle nullable types like ["string", "null"] - delegate when the
+            // non-null type is string, since the tagged format uses raw text
+            if (type_val.is_array()) {
+                for (const auto & t : type_val) {
+                    if (t.is_string() && t.get<std::string>() != "null") {
+                        return t.get<std::string>() == "string";
+                    }
+                }
+            }
+        }
+        // Delegate for enum schemas in raw mode - enum values are literal strings
+        if (s.raw && !s.schema->contains("type") && s.schema->contains("enum")) {
            return true;
        }
        return false;
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -7464,9 +7464,6 @@ class Gemma4Model(Gemma3Model):

        assert len(tokens) == vocab.vocab_size

-        # TODO @ngxson : there are some known (rare) issues with the tokenizer during development
-        # but I don't have time to dive into them right now;
-        # using a dedicated tokenizer name so that we can fix later without re-converting GGUF
        self.gguf_writer.add_tokenizer_model("gemma4")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
--- a/docs/backend/ZenDNN.md
+++ b/docs/backend/ZenDNN.md
@ -57,13 +57,14 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based

 ## Supported Operations

-The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.
+The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-based matrix multiplication (MUL_MAT_ID)** operations. Other operations are handled by the standard CPU backend.

 | Operation    | Status  | Notes                                          |
 |:-------------|:-------:|:----------------------------------------------:|
 | MUL_MAT      | Support | Accelerated via ZenDNN LowOHA MatMul           |
+| MUL_MAT_ID   | Support | Accelerated via ZenDNN LowOHA MatMul (MoE)     |

-*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
+*Note:* Since only MUL_MAT and MUL_MAT_ID are accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs and Mixture-of-Experts models).

 ## DataType Supports

@ -181,7 +182,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen

 ## Known Issues

-- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
+- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
 - **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
 - **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.

@ -216,4 +217,4 @@ Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-t

 ## TODO

-- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
+- Expand operation support beyond MUL_MAT and MUL_MAT_ID (attention operations, activations, etc.)
--- a/docs/build.md
+++ b/docs/build.md
@ -389,7 +389,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm


 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
+If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. Note that [`HSA_OVERRIDE_GFX_VERSION`] is [not supported on Windows](https://github.com/ROCm/ROCm/issues/2654).

 ### Unified Memory

--- a/docs/ops.md
+++ b/docs/ops.md
@ -68,7 +68,7 @@ Legend:
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | 🟡 | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
--- a/docs/ops/ZenDNN.csv
+++ b/docs/ops/ZenDNN.csv
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -2202,6 +2202,26 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
    return use_mul_mat_vec_f;
 }

+static bool ggml_cuda_should_use_mmvq(ggml_type type, int cc, int64_t ncols_dst) {
+    if (ncols_dst > MMVQ_MAX_BATCH_SIZE) {
+        return false;
+    }
+
+    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+        switch (type) {
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
+                return ncols_dst <= 4;
+            default:
+                break;
+        }
+    }
+
+    return true;
+}
+
 static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
    ggml_tensor *       src0 = tensor->src[0];
    ggml_tensor *       src1 = tensor->src[1];
@ -2211,11 +2231,11 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
                                   ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
                                   src0->view_src;

+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
-                             dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+                             dst->type == GGML_TYPE_F32 && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);

    // fusion is not universally faster on Pascal
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    if (cc <= GGML_CUDA_CC_PASCAL) {
        return false;
    }
@ -2272,6 +2292,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor

            const int cc            = ggml_cuda_info().devices[id].cc;
            const int warp_size     = ggml_cuda_info().devices[id].warp_size;
+            use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
            use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
            use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
            use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
@ -2280,6 +2301,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    } else {
        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
        const int warp_size     = ggml_cuda_info().devices[ctx.device].warp_size;
+        use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
        use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
        use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
        use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@ -360,8 +360,39 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
            }
        }

-        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
-        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA4(cc)){
+            switch (type) {
+                case GGML_TYPE_IQ2_S:
+                case GGML_TYPE_Q6_K:
+                    return ne11 <= 128;
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q5_0:
+                case GGML_TYPE_Q5_1:
+                case GGML_TYPE_MXFP4:
+                    return true;
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_IQ3_XXS:
+                case GGML_TYPE_IQ3_S:
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_XXS:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_IQ1_S:
+                case GGML_TYPE_Q4_K:
+                    return ne11 <= 256;
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_IQ4_NL:
+                case GGML_TYPE_IQ4_XS:
+                    return ne11 <= 512;
+
+                default:
+                    return false;
+
+            }
+
+            return false;
+        }
        return true;
    }

--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@ -1009,8 +1009,8 @@ public:
    bool get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response);

    struct stored_graph {
-        ggml_context_ptr ctx_ptr;
-        ggml_cgraph *    graph;
+        std::vector<uint8_t>   buffer;
+        ggml_cgraph          * graph;
    };

 private:
@ -1518,10 +1518,12 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
    LOG_DBG("[%s] device: %u, n_nodes: %u, n_tensors: %u\n", __func__, device, n_nodes, n_tensors);

    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
-
+    if (stored_graphs[device].buffer.size() < buf_size) {
+        stored_graphs[device].buffer.resize(buf_size);
+    }
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ NULL,
+        /*.mem_buffer =*/ stored_graphs[device].buffer.data(),
        /*.no_alloc   =*/ true,
    };
    ggml_context_ptr ctx_ptr { ggml_init(params) };
@ -1551,7 +1553,6 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
    }
    ggml_status status = ggml_backend_graph_compute(backends[device], graph);
    GGML_ASSERT(status == GGML_STATUS_SUCCESS && "Unsuccessful graph computations are not supported with RPC");
-    stored_graphs[device].ctx_ptr.swap(ctx_ptr);
    stored_graphs[device].graph = graph;
    return true;
 }
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@ -437,12 +437,18 @@ inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_

    // Head-dim specializations used by the tuned vec f16 path.
    switch (key.head_dim_qk) {
-        case 64: return 2u;
-        case 96: return 4u;
-        case 128: return 1u;
-        case 192: return 2u;
-        case 576: return 2u;
-        default: return 1u;
+        case 64:
+            return 2u;
+        case 96:
+            return 4u;
+        case 128:
+            return 1u;
+        case 192:
+            return 2u;
+        case 576:
+            return 2u;
+        default:
+            return 1u;
    }
 }

@ -513,9 +519,9 @@ struct ggml_webgpu_flash_attn_blk_shader_lib_context {
 };

 inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_blk_shader(
-    pre_wgsl::Preprocessor &                                    preprocessor,
-    const char *                                                shader_src,
-    const ggml_webgpu_flash_attn_blk_shader_lib_context &       context) {
+    pre_wgsl::Preprocessor &                              preprocessor,
+    const char *                                          shader_src,
+    const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
    std::vector<std::string> defines;
    std::string              variant = "flash_attn_vec_blk";

@ -1857,9 +1863,8 @@ class ggml_webgpu_shader_lib {
        defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));

        uint32_t q_tile  = context.sg_mat_m;
-        uint32_t kv_tile =
-            std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
-                     context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
+        uint32_t kv_tile = std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
+                                    context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
        if (context.key.use_vec) {
            q_tile  = 1;
            kv_tile = std::max(context.sg_mat_n, std::min(32u, ggml_webgpu_flash_attn_max_kv_tile(context)));
@ -1885,14 +1890,14 @@ class ggml_webgpu_shader_lib {
        }
        defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));

-        const char * shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
+        const char *    shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
        webgpu_pipeline pipeline =
            ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
-        auto decisions     = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
-        decisions->q_tile  = q_tile;
-        decisions->kv_tile = kv_tile;
-        decisions->wg_size = wg_size;
-        pipeline.context   = decisions;
+        auto decisions                    = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
+        decisions->q_tile                 = q_tile;
+        decisions->kv_tile                = kv_tile;
+        decisions->wg_size                = wg_size;
+        pipeline.context                  = decisions;
        flash_attn_pipelines[context.key] = pipeline;
        return flash_attn_pipelines[context.key];
    }
@ -1905,7 +1910,7 @@ class ggml_webgpu_shader_lib {

        ggml_webgpu_processed_shader processed =
            ggml_webgpu_preprocess_flash_attn_blk_shader(preprocessor, wgsl_flash_attn_vec_blk, context);
-        webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
+        webgpu_pipeline pipeline              = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
        flash_attn_blk_pipelines[context.key] = pipeline;
        return flash_attn_blk_pipelines[context.key];
    }
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
--- a/ggml/src/ggml-zendnn/CMakeLists.txt
+++ b/ggml/src/ggml-zendnn/CMakeLists.txt
@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
    ExternalProject_Add(
        zendnn
        GIT_REPOSITORY https://github.com/amd/ZenDNN.git
-        GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf    # ZenDNN-2026-WW08
+        GIT_TAG f79f7321a1add65ced6397a6bfab7edba6e3e14e    # ZenDNN-2026-WW13
        PREFIX      ${ZENDNN_PREFIX}
        SOURCE_DIR  ${ZENDNN_SOURCE_DIR}
        BINARY_DIR  ${ZENDNN_BUILD_DIR}
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@ -190,6 +190,170 @@ static void ggml_zendnn_compute_forward_mul_mat(
    }
 }

+struct mmid_row_mapping {  // identifies one row routed to an expert by its position in the ids tensor
+    int32_t i1;  // position along ids->ne[0] (the n_expert_used axis)
+    int32_t i2;  // position along ids->ne[1]
+};
+
+// Computes MUL_MAT_ID for `dst`: every src1 row is multiplied by the src0 expert
+// matrix that `ids` (dst->src[2]) selects for it.
+//
+// For each expert that has at least one row assigned:
+//   1. gather  - copy the expert's input rows into a dense scratch matrix
+//   2. gemm    - a single ggml_zendnn_sgemm call over the gathered rows
+//   3. scatter - copy every result row to its (i1, i2) position in dst
+//
+// When src1's type differs from src0's, src1 is first converted (only F32 src1 is
+// accepted in that case). All scratch memory comes from ctx->work_data, which is
+// grown on demand and laid out as [converted src1 | gathered rows | gemm output].
+//
+// Aborts via GGML_ASSERT / GGML_ABORT on unsupported layouts, out-of-range expert
+// ids, or a failed sgemm call.
+static void ggml_zendnn_compute_forward_mul_mat_id(
+    ggml_backend_zendnn_context * ctx,
+    ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];  // expert weights
+    const ggml_tensor * src1 = dst->src[1];  // inputs
+    const ggml_tensor * ids  = dst->src[2];  // expert ids
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // exit for no tokens to process
+    if (ne2 == 0 || ne11 == 0) {
+        return;
+    }
+
+    ggml_type         const vec_dot_type = src0->type;
+    ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float_ref;
+
+    // src1 is converted into the work buffer only when its type differs from src0's
+    const bool src1_needs_conv = src1->type != vec_dot_type;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(src0->type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ne03 == 1);
+    GGML_ASSERT(ne13 == 1);
+    GGML_ASSERT(ne3  == 1);
+
+    // row groups
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_experts
+
+    std::vector<int64_t> matrix_row_counts(n_as, 0);
+    std::vector<std::vector<mmid_row_mapping>> matrix_rows(n_as);
+
+    int64_t max_rows = 0;
+    // group rows by expert (preprocessing step)
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+        for (int id = 0; id < n_ids; ++id) {
+            const int32_t i02 = *(const int32_t *)((const char *)ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // explicit casts: brace-initialization forbids the implicit int64_t -> int32_t narrowing
+            matrix_rows[i02].push_back({static_cast<int32_t>(id), static_cast<int32_t>(iid1)});
+            matrix_row_counts[i02]++;
+            if (matrix_row_counts[i02] > max_rows) {
+                max_rows = matrix_row_counts[i02];
+            }
+        }
+    }
+
+    if (max_rows == 0) {
+        return; // no rows to process
+    }
+
+    const size_t row_size     = ggml_row_size(vec_dot_type, ne10);
+    const size_t dst_row_size = ggml_row_size(dst->type, ne01);
+
+    // size for converting src1 rows to vec_dot_type if needed
+    const size_t nbw1 = row_size;
+    const size_t nbw2 = nbw1 * ne11;
+    const size_t nbw3 = nbw2 * ne12;
+    const size_t src1_conv_size = src1_needs_conv ? ne13 * nbw3 : 0;
+
+    // size for MoE gather/scatter buffers
+    const size_t wdata_cur_size = max_rows * row_size;
+    const size_t dst_cur_size = max_rows * dst_row_size;
+
+    // allocate single buffer for all needs
+    const size_t total_size = src1_conv_size + wdata_cur_size + dst_cur_size;
+    if (ctx->work_size < total_size) {
+        ctx->work_data.reset(new char[total_size]);
+        ctx->work_size = total_size;
+    }
+
+    // partition the buffer
+    char * work_data = ctx->work_data.get();
+    char * wdata_cur = work_data + src1_conv_size;
+    char * dst_cur = wdata_cur + wdata_cur_size;
+
+    if (src1_needs_conv) {
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        #pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    const float * src1_f32 = (float *)((char *)src1->data + i11*nb11 + i12*nb12 + i13*nb13);
+                    void * src1_conv = (char *)work_data + i11*nbw1 + i12*nbw2 + i13*nbw3;
+                    from_float(src1_f32, src1_conv, ne10);
+                }
+            }
+        }
+    }
+
+    const void * wdata = src1_needs_conv ? work_data : src1->data;
+
+    // Strides of the gather source. The conversion buffer is densely packed
+    // (nbw1/nbw2); when src1 is used in place its own byte strides must be used
+    // so that a strided / non-contiguous src1 is read from the right addresses.
+    // (For a densely packed src1 these are expected to match nbw1/nbw2.)
+    const size_t wdata_nb1 = src1_needs_conv ? nbw1 : nb11;
+    const size_t wdata_nb2 = src1_needs_conv ? nbw2 : nb12;
+
+    // process each expert with gather -> gemm -> scatter pattern
+    for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
+        const int64_t cne1 = matrix_row_counts[cur_a];
+
+        if (cne1 == 0) {
+            continue;
+        }
+
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
+
+        // gather input rows for this expert
+        #pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
+        for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
+            const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
+            const int64_t id = row_mapping.i1;
+            const int64_t i11 = id % ne11; // wraps when src1 has fewer rows than ids has columns
+            const int64_t i12 = row_mapping.i2;
+
+            std::memcpy(
+                wdata_cur + ir1 * row_size,
+                (const char *) wdata + i11*wdata_nb1 + i12*wdata_nb2,
+                row_size
+            );
+        }
+
+        // batched gemm for all tokens in this expert
+        if (!ggml_zendnn_sgemm(ctx,
+                              ne01,       // m
+                              cne1,       // n
+                              ne10,       // k
+                              src0_cur,
+                              ne00,       // lda
+                              wdata_cur,
+                              ne10,       // ldb
+                              dst_cur,
+                              ne01,       // ldc
+                              src0->type,
+                              vec_dot_type,
+                              dst->type)) {
+            GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
+        }
+
+        // scatter output rows to destination
+        #pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
+        for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
+            const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
+            const int64_t i1 = row_mapping.i1;
+            const int64_t i2 = row_mapping.i2;
+
+            std::memcpy(
+                (char *) dst->data + i1*nb1 + i2*nb2,
+                dst_cur + ir1 * dst_row_size,
+                dst_row_size
+            );
+        }
+    }
+}
+
 // backend interface

 static const char * ggml_backend_zendnn_get_name(ggml_backend_t backend) {
@ -218,6 +382,9 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
            case GGML_OP_MUL_MAT:
                ggml_zendnn_compute_forward_mul_mat(ctx, node);
                break;
+            case GGML_OP_MUL_MAT_ID:
+                ggml_zendnn_compute_forward_mul_mat_id(ctx, node);
+                break;
            case GGML_OP_NONE:
            case GGML_OP_RESHAPE:
            case GGML_OP_VIEW:
@ -361,6 +528,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
            return true;

        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
        {
            const ggml_tensor * weights = op->src[0];
            const ggml_tensor * inputs = op->src[1];
@ -374,6 +542,17 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
                ne0 < min_batch || ne1 < min_batch || ne10 < min_batch) {
                    return false;
            }
+            // MUL_MAT_ID performs best with a moderate number of experts due to its
+            // gather + batched matmul + scatter approach. Future versions will leverage
+            // ZenDNN's grouped_gemm for better scalability with larger expert counts:
+            // https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_group_gemm_operator.md
+            if (op->op == GGML_OP_MUL_MAT_ID) {
+                const int64_t n_experts = weights->ne[2];
+                const int64_t max_experts = 32;
+                if (n_experts > max_experts) {
+                    return false;
+                }
+            }
            switch (weights->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_BF16:
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@ -66,9 +66,8 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(

    LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);

-    // note: the SWA cache is never quantized because it is relatively small
    kv_swa = std::make_unique<llama_kv_cache>(
-            model, GGML_TYPE_F16, GGML_TYPE_F16,
+            model, type_k, type_v,
            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -493,6 +493,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
+            case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
+                // Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
+                // normalizer, then BPE merges run on the whole text without
+                // word-level pre-splitting. We only need to split on newlines
+                // since BPE merge lookup asserts no newlines in tokens.
+                regex_exprs = {
+                    "[^\\n]+|[\\n]+",
+                };
+                byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
+                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@ -506,6 +516,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
    }

    std::vector<std::string> regex_exprs;
+    bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
 };

 struct llm_tokenizer_bpe_session {
@ -550,9 +561,10 @@ struct llm_tokenizer_bpe_session {

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);

        symbols_final.clear();
+        auto tok_pre = vocab.get_pre_type();

        for (const auto & word : word_collection) {
            work_queue = llm_bigram_bpe::queue();
@ -565,6 +577,13 @@ struct llm_tokenizer_bpe_session {
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
+            } else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
+                // fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
+                auto tok = vocab.text_to_token(word);
+                if (tok != LLAMA_TOKEN_NULL) {
+                    symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                    offset = word.size();
+                }
            }

            while (offset < word.size()) {
@ -1864,7 +1883,31 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            special_pad_id = 3;  // <|plamo:pad|>
            special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "gemma4") {
-            type = LLAMA_VOCAB_TYPE_SPM;
+            type = LLAMA_VOCAB_TYPE_BPE;
+
+            // read bpe merges and populate bpe ranks
+            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+            if (merges_keyidx == -1) {
+                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+            }
+            {
+                const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+                for (int i = 0; i < n_merges; i++) {
+                    const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+
+                    std::string first;
+                    std::string second;
+
+                    const size_t pos = word.find(' ', 1);
+
+                    if (pos != std::string::npos) {
+                        first  = word.substr(0, pos);
+                        second = word.substr(pos + 1);
+                    }
+
+                    bpe_ranks.emplace(std::make_pair(first, second), i);
+                }
+            }

            // default special tokens (to be read from GGUF)
            special_bos_id  = LLAMA_TOKEN_NULL;
@ -1874,7 +1917,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            special_pad_id  = LLAMA_TOKEN_NULL;
            special_mask_id = LLAMA_TOKEN_NULL;

-            tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            tokenizer_pre = "gemma4";
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }
@ -1882,6 +1925,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        // for now, only BPE models have pre-tokenizers
        if (type == LLAMA_VOCAB_TYPE_BPE) {
            add_space_prefix = false;
+            escape_whitespaces = false;
            clean_spaces = true;
            if (tokenizer_pre.empty()) {
                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
@ -1948,6 +1992,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            } else if (
                    tokenizer_pre == "jais-2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
+            } else if (
+                    tokenizer_pre == "gemma4") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
+                escape_whitespaces = true;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
                    tokenizer_pre == "jina-v2-code" ||
@ -3045,6 +3093,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

+                        if (escape_whitespaces) {
+                            llama_escape_whitespace(text);
+                        }
+
 #ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
@ -3224,6 +3276,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    if (escape_whitespaces) {
+                        // SPM-style BPE: tokens contain ▁ for spaces
+                        std::string result = token_text;
+                        llama_unescape_whitespace(result);
+                        return _try_copy(result.data(), result.size());
+                    }
                    std::string result = llama_decode_text(token_text);
                    return _try_copy(result.data(), result.size());
                }
@ -3654,9 +3712,7 @@ int llama_vocab::max_token_len() const {

 int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
    GGML_ASSERT(token_left.find(' ')   == std::string::npos);
-    GGML_ASSERT(token_left.find('\n')  == std::string::npos);
    GGML_ASSERT(token_right.find(' ')  == std::string::npos);
-    GGML_ASSERT(token_right.find('\n') == std::string::npos);

    auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
    if (it == pimpl->bpe_ranks.end()) {
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@ -58,6 +58,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_TINY_AYA        = 47,
    LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM       = 48,
    LLAMA_VOCAB_PRE_TYPE_JAIS2           = 49,
+    LLAMA_VOCAB_PRE_TYPE_GEMMA4          = 50,
 };

 struct LLM_KV;
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
    return false;
 }

-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
    // unicode categories
    static const std::map<std::string, int> k_ucat_enum = {
        { "\\p{N}", unicode_cpt_flags::NUMBER },
@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
        start += offset;
    }

-    return unicode_byte_encoding_process(bpe_words);
+    if (byte_encode) {
+        return unicode_byte_encoding_process(bpe_words);
+    }
+
+    return bpe_words;
 }
--- a/src/unicode.h
+++ b/src/unicode.h
@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);

 bool unicode_cpt_is_han(uint32_t cpt);

-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode = true);
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@ -657,6 +657,66 @@ static common_chat_tool imaginary_number_tool{
    })",
 };

+static common_chat_tool nullable_string_tool{  // test fixture: required "name" parameter typed as ["string", "null"]
+    /* .name = */ "set_nullable_str",
+    /* .description = */ "Set a nullable string value",
+    /* .parameters = */ R"({
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": ["string", "null"],
+                "description": "A nullable string"
+            }
+        },
+        "required": ["name"]
+    })",
+};
+
+static common_chat_tool nullable_string_null_first_tool{  // test fixture: same nullable string, but "null" is listed first in the type array
+    /* .name = */ "set_nullable_str_nf",
+    /* .description = */ "Set a nullable string value with null first in type array",
+    /* .parameters = */ R"({
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": ["null", "string"],
+                "description": "A nullable string with null first"
+            }
+        },
+        "required": ["name"]
+    })",
+};
+
+static common_chat_tool nullable_int_tool{  // test fixture: required "count" parameter typed as ["integer", "null"]
+    /* .name = */ "set_nullable_int",
+    /* .description = */ "Set a nullable integer value",
+    /* .parameters = */ R"({
+        "type": "object",
+        "properties": {
+            "count": {
+                "type": ["integer", "null"],
+                "description": "A nullable integer"
+            }
+        },
+        "required": ["count"]
+    })",
+};
+
+static common_chat_tool enum_no_type_tool{  // test fixture: required "unit" parameter declared only by an "enum" list, with no "type" key
+    /* .name = */ "set_unit",
+    /* .description = */ "Set a temperature unit",
+    /* .parameters = */ R"({
+        "type": "object",
+        "properties": {
+            "unit": {
+                "enum": ["celsius", "fahrenheit"],
+                "description": "Temperature unit"
+            }
+        },
+        "required": ["unit"]
+    })",
+};
+
 static common_chat_tool string_param_tool{
    /* .name = */ "string_param",
    /* .description = */ "Tool with string parameter for testing",
@ -2200,6 +2260,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
                }
            })
            .run();
+
    }

    {
@ -2383,6 +2444,58 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            })
            .expect_reconstruction()
            .run();
+
+        // nullable string type ["string", "null"]
+        tst.test(
+               "<tool_call>\n"
+               "<function=set_nullable_str>\n"
+               "<parameter=name>\nhello world\n</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .tools({ nullable_string_tool })
+            .expect_tool_calls({
+                { "set_nullable_str", R"({"name": "hello world"})", {} },
+            })
+            .run();
+
+        // nullable string with null first in type array ["null", "string"]
+        tst.test(
+               "<tool_call>\n"
+               "<function=set_nullable_str_nf>\n"
+               "<parameter=name>\nhello world\n</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .tools({ nullable_string_null_first_tool })
+            .expect_tool_calls({
+                { "set_nullable_str_nf", R"({"name": "hello world"})", {} },
+            })
+            .run();
+
+        // nullable integer type ["integer", "null"] - should use JSON value path, not string
+        tst.test(
+               "<tool_call>\n"
+               "<function=set_nullable_int>\n"
+               "<parameter=count>\n42\n</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .tools({ nullable_int_tool })
+            .expect_tool_calls({
+                { "set_nullable_int", R"({"count": 42})", {} },
+            })
+            .run();
+
+        // enum without explicit type key - should infer string from enum values
+        tst.test(
+               "<tool_call>\n"
+               "<function=set_unit>\n"
+               "<parameter=unit>\ncelsius\n</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .tools({ enum_no_type_tool })
+            .expect_tool_calls({
+                { "set_unit", R"({"unit": "celsius"})", {} },
+            })
+            .run();
    }
    {
        auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-V3.1.jinja", detailed_debug);
@ -2541,55 +2654,57 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
    // #20424 introduced effective_input = generation_prompt + input, but the throw
    // uses input.substr(result.end) where result.end is in effective_input space.
    {
-        auto tmpls = common_chat_templates_ptr(
-            common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));
+        if (!g_template_filter.empty() && std::string("models/templates/GLM-4.7-Flash.jinja").find(g_template_filter) != std::string::npos) {
+            auto tmpls = common_chat_templates_ptr(
+                common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));

-        static common_chat_tool weather_tool{
-            "get_weather", "Get weather",
-            R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
-        };
+            static common_chat_tool weather_tool{
+                "get_weather", "Get weather",
+                R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
+            };

-        common_chat_templates_inputs inputs;
-        inputs.tools = { weather_tool };
-        inputs.enable_thinking = true;
-        inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
-        inputs.add_generation_prompt = true;
-        inputs.use_jinja = true;
-        common_chat_msg msg;
-        msg.role = "user";
-        msg.content = "get_weather";
-        inputs.messages = { msg };
+            common_chat_templates_inputs inputs;
+            inputs.tools = { weather_tool };
+            inputs.enable_thinking = true;
+            inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+            inputs.add_generation_prompt = true;
+            inputs.use_jinja = true;
+            common_chat_msg msg;
+            msg.role = "user";
+            msg.content = "get_weather";
+            inputs.messages = { msg };

-        auto params = common_chat_templates_apply(tmpls.get(), inputs);
-        common_peg_arena arena;
-        arena.load(params.parser);
-        common_chat_parser_params pp(params);
+            auto params = common_chat_templates_apply(tmpls.get(), inputs);
+            common_peg_arena arena;
+            arena.load(params.parser);
+            common_chat_parser_params pp(params);

-        // generation_prompt is non-empty for thinking models, so result.end
-        // will be offset by generation_prompt.size() into effective_input space.
-        assert(!pp.generation_prompt.empty());
+            // generation_prompt is non-empty for thinking models, so result.end
+            // will be offset by generation_prompt.size() into effective_input space.
+            assert(!pp.generation_prompt.empty());

-        std::string bad_input =
-            "Thinking.\n"
-            "</think>"
-            "<tool_call>get_weather"
-            "<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
-            "</tool_call>\n";
+            std::string bad_input =
+                "Thinking.\n"
+                "</think>"
+                "<tool_call>get_weather"
+                "<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
+                "</tool_call>\n";

-        bool got_runtime_error = false;
-        bool got_out_of_range = false;
-        std::string error_msg;
-        try {
-            common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
-        } catch (const std::out_of_range & e) {
-            got_out_of_range = true;
-            error_msg = e.what();
-        } catch (const std::runtime_error & e) {
-            got_runtime_error = true;
-            error_msg = e.what();
+            bool got_runtime_error = false;
+            bool got_out_of_range = false;
+            std::string error_msg;
+            try {
+                common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
+            } catch (const std::out_of_range & e) {
+                got_out_of_range = true;
+                error_msg = e.what();
+            } catch (const std::runtime_error & e) {
+                got_runtime_error = true;
+                error_msg = e.what();
+            }
+            GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
+            GGML_ASSERT(got_runtime_error  && "throw path should produce std::runtime_error with parse position");
        }
-        GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
-        GGML_ASSERT(got_runtime_error  && "throw path should produce std::runtime_error with parse position");
    }

    // Kimi-K2-Thinking tests - custom parser
@ -3169,6 +3284,21 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .expect(message_assist_call_id)
            .expect_reconstruction()
            .run();
+
+        tst.test("[TOOL_CALLS]special_function[CALL_ID]000000001[ARGS]{\"arg1\": 1}"
+            "[TOOL_CALLS]special_function_with_opt[CALL_ID]000000002[ARGS]{\"arg1\": 1, \"arg2\": 2}")
+            .parallel_tool_calls(true)
+            .tools({
+                special_function_tool, special_function_tool_with_optional_param
+            })
+            .expect_tool_calls({
+                { "special_function", R"({"arg1": 1})", "000000001" },
+                { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", "000000002" },
+            })
+            .expect_reconstruction()
+            .run();
+
+
    }
    // Devstral
    {
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@ -523,6 +523,18 @@ static void test_filters(testing & t) {
        "hello"
    );

+    test_template(t, "upper array",
+        "{{ items|upper }}",
+        {{"items", json::array({"hello", "world"})}},
+        "['HELLO', 'WORLD']"
+    );
+
+    test_template(t, "upper dict",
+        "{{ items|upper }}",
+        {{"items", {{"hello", "world"}}}},
+        "{'HELLO': 'WORLD'}"
+    );
+
    test_template(t, "capitalize",
        "{{ 'heLlo World'|capitalize }}",
        json::object(),
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@ -176,8 +176,8 @@
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@ -255,8 +255,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |

--- a/tools/parser/debug-template-parser.cpp
+++ b/tools/parser/debug-template-parser.cpp
@ -5,15 +5,15 @@
 #include "gguf.h"
 #include "jinja/runtime.h"
 #include "log.h"
+#include "nlohmann/json.hpp"
+#include "peg-parser.h"

 #include <fstream>
 #include <numeric>
+#include <optional>
 #include <sstream>
 #include <string>

-#include "nlohmann/json.hpp"
-#include "peg-parser.h"
-
 using json = nlohmann::ordered_json;

 enum class output_mode {
@ -34,14 +34,14 @@ enum class input_message_type {
 };

 struct debug_options {
-    std::string      template_path;
-    bool             with_tools        = true;
-    bool             generation_prompt = true;
-    bool             enable_reasoning  = true;
-    bool             debug_jinja       = false;
-    bool             force_tool_call   = false;
-    output_mode      mode              = output_mode::BOTH;
-    input_message_type input_message   = input_message_type::NONE;
+    std::string        template_path;
+    bool               with_tools        = true;
+    bool               generation_prompt = true;
+    bool               enable_reasoning  = true;
+    bool               debug_jinja       = false;
+    bool               force_tool_call   = false;
+    output_mode        mode              = output_mode::BOTH;
+    input_message_type input_message     = input_message_type::NONE;
 };

 static std::string read_file(const std::string & path) {
@ -274,7 +274,7 @@ static void render_scenario(const common_chat_template & tmpl,
    json final_messages = messages;
    if (add_generation_prompt && !messages.empty() && messages.back().value("role", "") == "assistant") {
        final_messages.push_back(json{
-            { "role",    "user" },
+            { "role",    "user"                                       },
            { "content", "Now please continue with another response." }
        });
    }
@ -305,7 +305,7 @@ static void render_all_scenarios(const common_chat_template & tmpl,
                                 const json &                 tools,
                                 bool                         add_generation_prompt,
                                 bool                         enable_thinking,
-                                 input_message_type             message_type) {
+                                 input_message_type           message_type) {
    json user_msg = build_user_message();

    auto render_if = [&](input_message_type type, const std::string & name, const json & assistant_msg) {
@ -335,6 +335,24 @@ static void render_all_scenarios(const common_chat_template & tmpl,
    }
 }

+// Assemble the generation params for this debug run from opts: one user message, reasoning/generation-prompt flags, optional tools.
+static autoparser::generation_params prepare_params(const debug_options & opts, const json & tools) {
+    autoparser::generation_params result;
+    result.messages              = json::array({ build_user_message() });
+    result.add_generation_prompt = opts.generation_prompt;
+    result.enable_thinking       = opts.enable_reasoning;
+    result.reasoning_format      = opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
+    result.parallel_tool_calls   = false;
+    if (!opts.with_tools) {
+        result.tools       = json();
+        result.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
+    } else {
+        result.tools       = tools;
+        result.tool_choice = opts.force_tool_call ? COMMON_CHAT_TOOL_CHOICE_REQUIRED : COMMON_CHAT_TOOL_CHOICE_AUTO;
+    }
+    return result;
+}
+
 int main(int argc, char ** argv) {
    // Set log level to most verbose to capture all debug output
    common_log_set_verbosity_thold(99);
@ -369,49 +387,41 @@ int main(int argc, char ** argv) {
    try {
        common_chat_template chat_template(template_source, "", "");

-        // Build tools definition
        json tools = opts.with_tools ? build_tools_definition() : json();

-        // Render template scenarios if requested
-        if (opts.input_message != input_message_type::NONE &&
-            (opts.mode == output_mode::TEMPLATE || opts.mode == output_mode::BOTH)) {
+        autoparser::generation_params params = prepare_params(opts, tools);
+        common_chat_params            parser_data;
+        if (std::optional<common_chat_params> spec_tmpl =
+                common_chat_try_specialized_template(chat_template, template_source, params)) {
            LOG_ERR("\n");
-            LOG_ERR("================================================================================\n");
-            LOG_ERR("                         TEMPLATE RENDERING OUTPUT\n");
-            LOG_ERR("================================================================================\n");
+            LOG_ERR("This template uses a specialized parser, analysis results will not be available.");
+            parser_data = *spec_tmpl;
+        } else {
+            // Render template scenarios if requested
+            if (opts.input_message != input_message_type::NONE &&
+                (opts.mode == output_mode::TEMPLATE || opts.mode == output_mode::BOTH)) {
+                LOG_ERR("\n");
+                LOG_ERR("================================================================================\n");
+                LOG_ERR("                         TEMPLATE RENDERING OUTPUT\n");
+                LOG_ERR("================================================================================\n");

-            render_all_scenarios(chat_template, tools, opts.generation_prompt, opts.enable_reasoning,
-                                 opts.input_message);
-        }
-
-        // Output analysis if requested
-        if (opts.mode == output_mode::ANALYSIS || opts.mode == output_mode::BOTH) {
-            LOG_ERR("\n");
-            LOG_ERR("================================================================================\n");
-            LOG_ERR("                           TEMPLATE ANALYSIS\n");
-            LOG_ERR("================================================================================\n");
-
-            autoparser::autoparser analysis;
-            analysis.analyze_template(chat_template);
-
-            // Generate Parser
-            autoparser::generation_params params;
-            params.messages = json::array({ build_user_message() });
-            params.reasoning_format =
-                opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
-            params.enable_thinking       = opts.enable_reasoning;
-            params.add_generation_prompt = opts.generation_prompt;
-
-            if (opts.with_tools) {
-                params.tools       = tools;
-                params.tool_choice = opts.force_tool_call ? COMMON_CHAT_TOOL_CHOICE_REQUIRED : COMMON_CHAT_TOOL_CHOICE_AUTO;
-            } else {
-                params.tools       = json();
-                params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
+                render_all_scenarios(chat_template, tools, opts.generation_prompt, opts.enable_reasoning,
+                                     opts.input_message);
            }
-            params.parallel_tool_calls = false;

-            auto parser_data = autoparser::peg_generator::generate_parser(chat_template, params, analysis);
+            // Output analysis if requested
+            if (opts.mode == output_mode::ANALYSIS || opts.mode == output_mode::BOTH) {
+                LOG_ERR("\n");
+                LOG_ERR("================================================================================\n");
+                LOG_ERR("                           TEMPLATE ANALYSIS\n");
+                LOG_ERR("================================================================================\n");
+
+                autoparser::autoparser analysis;
+                analysis.analyze_template(chat_template);
+
+                // Generate Parser
+                parser_data = autoparser::peg_generator::generate_parser(chat_template, params, analysis);
+            }

            LOG_ERR("\n=== Generated Parser ===\n");
            common_peg_arena arena;
--- a/tools/server/README.md
+++ b/tools/server/README.md
@ -167,6 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
+| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
 | `-sp, --special` | special tokens output enabled (default: false) |
@ -221,8 +222,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -605,6 +605,17 @@ private:
        llama_batch_free(batch);
    }

+    // Save the slot's prompt into prompt_cache, clear the slot, then update the cache; does nothing when the slot holds no tokens.
+    void slot_save_and_clear(server_slot & slot) {
+        if (slot.prompt.n_tokens() != 0) {
+            SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
+            SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
+            slot.prompt_save(*prompt_cache);
+            slot.prompt_clear(false);
+            prompt_cache->update();
+        }
+    }
+
    void handle_sleeping_state(bool new_state) {
        GGML_ASSERT(sleeping != new_state);
        if (new_state) {
@ -864,6 +875,19 @@ private:

        metrics.init();

+        if (params_base.clear_idle) {
+            if (!params_base.kv_unified) {
+                SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
+                params_base.clear_idle = false;
+            } else if (params_base.cache_ram_mib == 0) {
+                SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
+                params_base.clear_idle = false;
+            } else {
+                SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
+                SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
+            }
+        }
+
        // populate webui settings
        {
            if (!params_base.webui_config_json.empty()) {
@ -1010,15 +1034,15 @@ private:
            // cache prompts only for completion tasks
            update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;

-            // don't update the cache if the slot's context is empty
-            update_cache = update_cache && tokens.size() > 0;
-
            if (update_cache) {
                SRV_WRN("%s", "updating prompt cache\n");

                const int64_t t_start = ggml_time_us();

-                ret->prompt_save(*prompt_cache);
+                // don't save the slot's state if its context is empty
+                if (tokens.size() > 0) {
+                    ret->prompt_save(*prompt_cache);
+                }

                if (!ret->prompt_load(*prompt_cache, task.tokens)) {
                    ret->prompt_clear(false);
@ -1692,9 +1716,7 @@ private:
                    const int id_slot = task.id_slot;
                    const int id_task = task.id;

-                    server_slot * slot = id_slot != -1
-                                            ? get_slot_by_id(id_slot)
-                                            : get_available_slot(task);
+                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);

                    //
                    // slot scheduling logic
@ -1731,6 +1753,14 @@ private:
                        SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
                        break; // drop the task
                    }
+
+                    if (params_base.clear_idle) {
+                        for (auto & s : slots) {
+                            if (!s.is_processing()) {
+                                slot_save_and_clear(s);
+                            }
+                        }
+                    }
                } break;
            case SERVER_TASK_TYPE_CANCEL:
                {
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@ -2008,7 +2008,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
 bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) {
    const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);

-    float f_keep_best = float(lcp_best) / prompt.tokens.size();
+    float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
    float sim_best    = float(lcp_best) / tokens_new.size();

    SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@ -0,0 +1,115 @@
+import os
+import tempfile
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+class LogReader:
+    """Incremental file reader: each drain() returns only the text added since the previous call."""
+    def __init__(self, path):
+        self.path, self.pos = path, 0
+    def drain(self):
+        with open(self.path) as stream:
+            stream.seek(self.pos)
+            chunk = stream.read()
+            self.pos = stream.tell()
+        return chunk
+
+@pytest.fixture(autouse=True)
+def create_server():
+    # Autouse: rebuild the module-level server from the tinyllama2 preset for each test
+    # and point its log_path at a freshly created temp file.
+    global server
+    server = ServerPreset.tinyllama2()
+    overrides = dict(n_slots=2, n_predict=4, temperature=0.0,
+                     server_slots=True, cache_ram=100,
+                     kv_unified=True, debug=True)
+    for attr, value in overrides.items():
+        setattr(server, attr, value)
+    log_fd, server.log_path = tempfile.mkstemp(suffix='.log')
+    os.close(log_fd)  # only the path is needed; the descriptor is not used
+    yield
+
+
+LONG_PROMPT = (
+    "Once upon a time in a land far away, there lived a brave knight "
+    "who traveled across mountains and rivers to find the legendary "
+    "golden sword hidden deep within the enchanted forest of whispers. "
+    "He met many creatures along the way including dragons and fairies "
+    "and wizards who helped him on his noble quest to save the kingdom."
+)
+
+
+# idle slot cleared on launch should restore from cache-ram
+def test_clear_and_restore():
+    """Round trip of an idle slot through cache-ram.
+
+    Fill slot 0, run a task on slot 1 (slot 0 is then idle and gets saved and cleared),
+    and re-send the first prompt expecting part of it to be served from cache.
+    A final follow-up request must not trigger another clear.
+    """
+    global server
+    server.start()
+    log = LogReader(server.log_path)
+    # marker the server logs whenever it saves and clears an idle slot
+    clear_tag = "__TEST_TAG_CLEAR_IDLE_SLOT__"
+
+    def complete(prompt, **extra):
+        # payload keys stay in the order: prompt, [id_slot], cache_prompt
+        payload = {"prompt": prompt, **extra, "cache_prompt": True}
+        return server.make_request("POST", "/completion", data=payload)
+
+    # verify feature is enabled
+    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
+
+    first = complete(LONG_PROMPT, id_slot=0)
+    assert first.status_code == 200
+    original_prompt_n = first.body["timings"]["prompt_n"]
+
+    # Slot 0 is the only slot with KV, so nothing should have been cleared
+    assert clear_tag not in log.drain()
+
+    # Launching slot 1 clears idle slot 0
+    second = complete("The quick brown fox", id_slot=1)
+    assert second.status_code == 200
+    assert clear_tag in log.drain()
+
+    # Same prompt again without pinning a slot: should restore from cache-ram
+    third = complete(LONG_PROMPT)
+    assert third.status_code == 200
+    assert "updating prompt cache" in log.drain()
+    timings = third.body["timings"]
+    assert timings["cache_n"] > 0
+    assert timings["prompt_n"] < original_prompt_n
+
+    # Follow-up extending the prompt: no further clearing expected
+    fourth = complete(LONG_PROMPT + " The knight finally reached the castle gates.")
+    assert fourth.status_code == 200
+    assert clear_tag not in log.drain()
+
+
+def test_disabled_with_flag():
+    """With no_clear_idle set, the server must neither announce the feature
+    nor log a clear marker when a task is launched on a different slot.
+    """
+    global server
+    server.no_clear_idle = True
+    server.start()
+    log = LogReader(server.log_path)
+
+    # Feature should not be enabled
+    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
+
+    # First fill slot 0, then issue a request on a different slot
+    sequence = ((LONG_PROMPT, 0), ("The quick brown fox", 1))
+    for prompt, slot_id in sequence:
+        res = server.make_request("POST", "/completion", data={
+            "prompt": prompt,
+            "id_slot": slot_id,
+            "cache_prompt": True,
+        })
+        assert res.status_code == 200
+
+    # Neither request should have triggered clearing
+    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@ -102,6 +102,9 @@ class ServerProcess:
    mmproj_url: str | None = None
    media_path: str | None = None
    sleep_idle_seconds: int | None = None
+    cache_ram: int | None = None
+    no_clear_idle: bool = False
+    log_path: str | None = None
    webui_mcp_proxy: bool = False

    # session variables
@ -237,6 +240,10 @@ class ServerProcess:
            server_args.extend(["--media-path", self.media_path])
        if self.sleep_idle_seconds is not None:
            server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
+        if self.cache_ram is not None:
+            server_args.extend(["--cache-ram", self.cache_ram])
+        if self.no_clear_idle:
+            server_args.append("--no-clear-idle")
        if self.webui_mcp_proxy:
            server_args.append("--webui-mcp-proxy")

@ -249,11 +256,16 @@ class ServerProcess:
            flags |= subprocess.CREATE_NEW_PROCESS_GROUP
            flags |= subprocess.CREATE_NO_WINDOW

+        if self.log_path:
+            self._log = open(self.log_path, "w")
+        else:
+            self._log = sys.stdout
+
        self.process = subprocess.Popen(
            [str(arg) for arg in [server_path, *server_args]],
            creationflags=flags,
-            stdout=sys.stdout,
-            stderr=sys.stdout,
+            stdout=self._log,
+            stderr=self._log if self._log != sys.stdout else sys.stdout,
            env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
        )
        server_instances.add(self)
@ -298,6 +310,8 @@ class ServerProcess:
            except Exception as e:
                print(f"Error waiting for server: {e}")
            self.process = None
+        if hasattr(self, '_log') and self._log != sys.stdout:
+            self._log.close()

    def make_request(
        self,
Author	SHA1	Message	Date
Jiacheng (Jason) Chen	a44a875f8c	Merge `6cf18ecd6b` into `d006858316`	2026-04-04 02:06:49 +02:00
Reese Levine	d006858316	ggml-webgpu: move from parameter buffer pool to single buffer with offsets (#21278 ) * Work towards removing bitcast * Move rest of existing types over * Add timeout back to wait and remove synchronous set_tensor/memset_tensor * move to unpackf16 for wider compatibility * cleanup * Remove deadlock condition in free_bufs * Start work on removing parameter buffer pools * Simplify and optimize further * simplify profile futures * Fix stride * Try using a single command buffer per batch * formatting	2026-04-03 11:40:14 -07:00
Masato Nakasaka	e439700992	ci: Add Windows Vulkan backend testing on Intel (#21292 ) * experimenting CI * Experimenting CI fix for MinGW * experimenting CI on Windows * modified script for integration with VisualStudio * added proxy handling * adding python version for Windows execution * fix iterator::end() dereference * fixed proxy handling * Fix errors occurring on Windows * fixed ci script * Reverted to master * Stripping test items to simplify Windows test * adjusting script for windows testing * Changed shell * Fixed shell * Fixed shell * Fix CI setting * Fix CI setting * Fix CI setting * Experimenting ci fix * Experimenting ci fix * Experimenting ci fix * Experimenting ci fix * experimenting fix for unit test error * Changed to use BUILD_LOW_PERF to skip python tests * Fix CI * Added option to specify Ninja generator * Reverted proxy related changes	2026-04-03 20:16:44 +03:00
Yes You Can Have Your Own	50e0ad08fb	server: save and clear idle slots on new task (`--clear-idle`) (#20993 ) * server: clear idle slots KV from VRAM (LLAMA_KV_KEEP_ONLY_ACTIVE) * server: move idle slot KV clearing to slot release The save "cost" is now paid by the finishing request. * server: add --kv-clear-idle flag, enable by default * server: skip clearing last idle slot, clear on launch * server: test --no-kv-clear-idle flag * server: simplify on-release clearing loop * server: remove on-release KV clearing, keep launch-only * cont : clean-up * tests: update log strings after --clear-idle rename * tests: use debug tags instead of log message matching * test: fix Windows CI by dropping temp log file unlink --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-04-03 19:02:27 +02:00
Piotr Wilkin (ilintar)	f1f793ad06	common/parser: fix call ID detection (Mistral parser mostly) + atomicity for tag-json parsers (#21230 ) * Fix call ID detection (Mistral parser mostly) + atomicity for tag-json parsers * Rename * Update common/chat-auto-parser-generator.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-04-03 17:51:52 +02:00
Samanvya Tripathi	af5c13841f	common : fix tool call type detection for nullable and enum schemas (#21327 ) * common : fix tool call type detection for nullable and enum schemas * common, tests : fix grammar delegation for nullable/enum schemas and add tests Fix enum type inference to scan all enum values (not just index 0) so schemas like {"enum": [0, "celsius"]} correctly detect string type. Fix schema_delegates in peg-parser to handle nullable type arrays (["string", "null"]) and typeless enum schemas in raw mode, allowing the tagged parser to use raw text instead of JSON-formatted strings. Add test cases for Qwen3-Coder (TAG_WITH_TAGGED format): - nullable string ["string", "null"] - nullable string with null first ["null", "string"] - nullable integer ["integer", "null"] - enum without explicit type key	2026-04-03 17:51:23 +02:00
M1DNYT3	277ff5fff7	docker : bump cuda12 to 12.9.1 (#20920 ) Co-authored-by: M1DNYT3 <m1dnyt3@MacBookPro.lan> Co-authored-by: CISC <CISC@users.noreply.github.com>	2026-04-03 15:06:45 +02:00
jeromew	384c0076bc	docs: Update build.md: HSA_OVERRIDE_GFX_VERSION clarification (#21331 ) The `HSA_OVERRIDE_GFX_VERSION` variable can be used in ROCm to override an unsupported target architecture with a similar but supported target architecture. This does not and has never worked on Windows. I think the clarification could avoid driving Windows people towards this solution that does not work.	2026-04-03 21:05:14 +08:00
Sigbjørn Skjæret	1f34806c44	jinja: coerce input for string-specific filters (#21370 )	2026-04-03 15:03:33 +02:00
Aaron Teo	887535c33f	ci: add more binary checks (#21349 )	2026-04-03 20:50:00 +08:00
Piotr Wilkin (ilintar)	d3416a4aa9	fix: remove stale assert (#21369 )	2026-04-03 13:40:41 +02:00
uvos	43a4ee4a2c	HIP: build eatch ci build test for a different architecture (#21337 ) This helps improve our chances of finding build failures before the release workflow builds for all architectures.	2026-04-03 11:38:22 +02:00
Tillerino	f851fa5ab0	fix: add openssl to nix dependencies (#21353 ) (#21355 )	2026-04-03 12:21:07 +03:00
Vishal Singh	f1ac84119c	ggml-zendnn : add MUL_MAT_ID op support for MoE models (#21315 ) * ggml-zendnn : add MUL_MAT_ID op support for MoE models - Add MUL_MAT_ID op acceleration for Mixture-of-Experts models - MUL_MAT_ID op fallback to CPU backend if total experts > 32 - Point ZenDNN lib to latest bits ZenDNN-2026-WW13 * ggml-zendnn : add braces to sgemm failure condition for consistency Co-authored-by: Aaron Teo <taronaeo@gmail.com> --------- Co-authored-by: Aaron Teo <taronaeo@gmail.com>	2026-04-03 12:19:08 +03:00
Piotr Wilkin (ilintar)	b069b10ab4	vocab: fix Gemma4 tokenizer (#21343 ) * seems to work * fix case with new line Co-authored-by: sayap <sokann@gmail.com> * gemma 4: fix pre tok regex --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: sayap <sokann@gmail.com>	2026-04-03 10:33:03 +02:00
Radoslav Gerganov	0c58ba3365	rpc : reuse compute graph buffers (#21299 ) Reuse the buffer for the ggml context which is used for creating the compute graph on the server side. This partially addresses a memory leak created by the CUDA backend due to using buffer addresses as cache keys. ref: #21265 ref: #20315	2026-04-03 10:28:09 +03:00
Georgi Gerganov	57ace0d612	chat : avoid including json in chat.h (#21306 )	2026-04-03 09:07:59 +03:00
Georgi Gerganov	39b27f0da0	(revert) kv-cache : do not quantize SWA KV cache (#21332 ) This reverts commit `17193cce34`.	2026-04-03 09:07:01 +03:00
Vishal Singh	f49e917876	ci : add AMD ZenDNN label to PR labeler (#21345 ) * ci : add AMD CPU label to PR labeler Add automatic labeling for PRs that modify AMD CPU (ZenDNN) backend files * ci : rename label AMD CPU to AMD ZenDNN in labeler config Co-authored-by: Aaron Teo <taronaeo@gmail.com> --------- Co-authored-by: Aaron Teo <taronaeo@gmail.com>	2026-04-03 10:35:15 +08:00
jiachengjason	6cf18ecd6b	remove extra comments	2026-02-03 15:04:56 -05:00
Jiacheng (Jason) Chen	a1b7063926	Merge branch 'ggml-org:master' into fix/jiachengjason/rocm7.x_regression	2026-02-03 10:40:29 -05:00
jiachengjason	08d444578d	fine tuned gpt-oss configs	2026-01-26 17:51:47 -05:00
jiachengjason	ec3fce1512	Merge remote-tracking branch 'origin/master' into fix/jiachengjason/rocm7.x_regression	2026-01-23 17:20:58 -05:00
jiachengjason	4d5b79970c	overall further tuning for all models	2026-01-23 17:15:01 -05:00
jiachengjason	57e1eaf716	additional tuning for qwen models	2026-01-19 12:31:42 -05:00
jiachengjason	b2d628dc51	tune mmq/rocblas switching for RDNA4	2026-01-13 11:09:50 -05:00