Compare commits

...

12 Commits

Author SHA1 Message Date
Dan Hoffman f9d6cd646d
Merge 9f7ce433aa into 43a4ee4a2c 2026-04-03 13:16:36 +02:00
uvos 43a4ee4a2c
HIP: build each CI build test for a different architecture (#21337)
This helps improve our chances of finding build failures before the release workflow
builds for all architectures.
2026-04-03 11:38:22 +02:00
Tillerino f851fa5ab0
fix: add openssl to nix dependencies (#21353) (#21355) 2026-04-03 12:21:07 +03:00
Vishal Singh f1ac84119c
ggml-zendnn : add MUL_MAT_ID op support for MoE models (#21315)
* ggml-zendnn : add MUL_MAT_ID op support for MoE models
- Add MUL_MAT_ID op acceleration for Mixture-of-Experts models
- MUL_MAT_ID op fallback to CPU backend if total experts > 32
- Point ZenDNN lib to the latest ZenDNN-2026-WW13 bits

* ggml-zendnn : add braces to sgemm failure condition for consistency

Co-authored-by: Aaron Teo <taronaeo@gmail.com>

---------

Co-authored-by: Aaron Teo <taronaeo@gmail.com>
2026-04-03 12:19:08 +03:00
Piotr Wilkin (ilintar) b069b10ab4
vocab: fix Gemma4 tokenizer (#21343)
* seems to work

* fix case with new line

Co-authored-by: sayap <sokann@gmail.com>

* gemma 4: fix pre tok regex

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: sayap <sokann@gmail.com>
2026-04-03 10:33:03 +02:00
Radoslav Gerganov 0c58ba3365
rpc : reuse compute graph buffers (#21299)
Reuse the buffer for the ggml context that is used for creating the
compute graph on the server side. This partially addresses a memory leak
caused by the CUDA backend using buffer addresses as cache
keys.

ref: #21265
ref: #20315
2026-04-03 10:28:09 +03:00
Georgi Gerganov 57ace0d612
chat : avoid including json in chat.h (#21306) 2026-04-03 09:07:59 +03:00
Georgi Gerganov 39b27f0da0
(revert) kv-cache : do not quantize SWA KV cache (#21332)
This reverts commit 17193cce34.
2026-04-03 09:07:01 +03:00
Vishal Singh f49e917876
ci : add AMD ZenDNN label to PR labeler (#21345)
* ci : add AMD CPU label to PR labeler
Add automatic labeling for PRs that modify AMD CPU (ZenDNN) backend files

* ci : rename label AMD CPU to AMD ZenDNN in labeler config

Co-authored-by: Aaron Teo <taronaeo@gmail.com>

---------

Co-authored-by: Aaron Teo <taronaeo@gmail.com>
2026-04-03 10:35:15 +08:00
Slobodan Josic 7c7d6ce5c7
[HIP] Bump ROCm version to 7.2.1 (#21066)
Bump ROCm version on Linux from 7.2 to 7.2.1
Add gfx1102 target
Delete the LLVM workaround, since ROCm 7.2.1 includes a fix for the ROCm 7.2 perf regression: https://github.com/ROCm/rocm-systems/issues/2865

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-03 00:59:20 +02:00
Piotr Wilkin (ilintar) 5208e2d5ba
fix: gemma 4 template (#21326) 2026-04-02 23:31:02 +02:00
Dan Hoffman 9f7ce433aa Fix undefined timing measurement errors in server context 2026-03-30 21:09:31 -07:00
30 changed files with 3776 additions and 7339 deletions

View File

@ -16,7 +16,7 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
curl,
openssl,
shaderc,
useBlas ?
builtins.all (x: !x) [
@ -160,7 +160,8 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ [ openssl ];
cmakeFlags =
[

View File

@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=7.2
ARG AMDGPU_VERSION=7.2
ARG ROCM_VERSION=7.2.1
ARG AMDGPU_VERSION=7.2.1
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@ -12,11 +12,11 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# This is mostly tied to rocBLAS supported archs.
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.1/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201'
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}

.github/labeler.yml
View File

@ -27,6 +27,11 @@ IBM zDNN:
- any-glob-to-any-file:
- ggml/include/ggml-zdnn.h
- ggml/src/ggml-zdnn/**
AMD ZenDNN:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-zendnn.h
- ggml/src/ggml-zendnn/**
documentation:
- changed-files:
- any-glob-to-any-file:

View File

@ -472,6 +472,7 @@ jobs:
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGPU_TARGETS="gfx1030" \
-DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
@ -941,7 +942,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@ -984,12 +985,13 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGPU_TARGETS="gfx1100" `
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

View File

@ -35,7 +35,7 @@ env:
jobs:
ubuntu-22-hip-quality-check:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:7.2
container: rocm/dev-ubuntu-22.04:7.2.1
steps:
- name: Clone
id: checkout
@ -59,7 +59,7 @@ jobs:
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGPU_TARGETS=gfx908 \
-DGPU_TARGETS=gfx942 \
-DGGML_HIP=ON \
-DGGML_HIP_EXPORT_METRICS=Off \
-DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \

View File

@ -639,8 +639,8 @@ jobs:
strategy:
matrix:
include:
- ROCM_VERSION: "7.2"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201"
- ROCM_VERSION: "7.2.1"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
steps:
@ -662,7 +662,7 @@ jobs:
sudo apt install -y build-essential git cmake wget
- name: Setup Legacy ROCm
if: matrix.ROCM_VERSION == '7.2'
if: matrix.ROCM_VERSION == '7.2.1'
id: legacy_env
run: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
@ -683,7 +683,7 @@ jobs:
sudo apt-get install -y libssl-dev rocm-hip-sdk
- name: Setup TheRock
if: matrix.ROCM_VERSION != '7.2'
if: matrix.ROCM_VERSION != '7.2.1'
id: therock_env
run: |
wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
@ -699,7 +699,6 @@ jobs:
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
@ -717,17 +716,20 @@ jobs:
id: tag
uses: ./.github/actions/get-tag-name
- name: Get ROCm short version
run: echo "ROCM_VERSION_SHORT=$(echo '${{ matrix.ROCM_VERSION }}' | cut -d '.' -f 1,2)" >> $GITHUB_ENV
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
windows-hip:
runs-on: windows-2022
@ -749,7 +751,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@ -806,7 +808,7 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `

View File

@ -7,11 +7,109 @@
#include "log.h"
#include "nlohmann/json.hpp"
#include <algorithm>
#include <stdexcept>
#include <string>
using json = nlohmann::ordered_json;
namespace {
// Gemma4-specific PEG builder extending the standard chat builder.
// Adds value type parsers that use <|\"|> as string delimiters
// instead of JSON's double quotes, and disables json-to-schema
// conversion for these types.
class common_peg_gemma4_builder {
common_chat_peg_builder & p_;
static constexpr const char * QUOTE = "<|\"|>";
public:
explicit common_peg_gemma4_builder(common_chat_peg_builder & p) : p_(p) {}
common_peg_parser gemma4_string() {
return p_.rule("gemma4-string", [&]() {
return p_.literal(QUOTE) + p_.until(QUOTE) + p_.literal(QUOTE);
});
}
common_peg_parser gemma4_number() {
return p_.rule("gemma4-number", [&]() {
auto digit1_9 = p_.chars("[1-9]", 1, 1);
auto digits = p_.chars("[0-9]");
auto int_part = p_.choice({p_.literal("0"), p_.sequence({digit1_9, p_.chars("[0-9]", 0, -1)})});
auto frac = p_.sequence({p_.literal("."), digits});
auto exp = p_.sequence({p_.choice({p_.literal("e"), p_.literal("E")}),
p_.optional(p_.chars("[+-]", 1, 1)), digits});
auto not_number_continuation = p_.negate(p_.chars("[0-9.eE+-]", 1, 1));
return p_.sequence({p_.optional(p_.literal("-")), int_part, p_.optional(frac),
p_.optional(exp), not_number_continuation});
});
}
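// accepts e.g. -12.5e3 and 0; rejects 01 and a bare trailing dot, matching
// JSON number syntax, and the final negative lookahead prevents the rule
// from succeeding mid-number (e.g. stopping after "12" in "12.5")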
common_peg_parser gemma4_bool() {
return p_.rule("gemma4-bool", [&]() {
return p_.choice({p_.literal("true"), p_.literal("false")});
});
}
common_peg_parser gemma4_null() {
return p_.rule("gemma4-null", [&]() {
return p_.literal("null");
});
}
common_peg_parser gemma4_dict() {
return p_.rule("gemma4-dict", [&]() {
auto ws = p_.space();
auto key = p_.until(":");
auto member = p_.sequence({key, p_.literal(":"), ws, gemma4_value()});
auto members = p_.sequence({member, p_.zero_or_more(p_.sequence({p_.literal(","), ws, member}))});
return p_.sequence({
p_.literal("{"), ws,
p_.choice({p_.literal("}"), p_.sequence({members, ws, p_.literal("}")})})
});
});
}
common_peg_parser gemma4_array() {
return p_.rule("gemma4-array", [&]() {
auto ws = p_.space();
auto elements = p_.sequence({gemma4_value(), p_.zero_or_more(p_.sequence({p_.literal(","), ws, gemma4_value()}))});
return p_.sequence({
p_.literal("["), ws,
p_.choice({p_.literal("]"), p_.sequence({elements, ws, p_.literal("]")})})
});
});
}
common_peg_parser gemma4_value() {
return p_.rule("gemma4-value", [&]() {
return p_.choice({gemma4_string(), gemma4_dict(), gemma4_array(),
gemma4_number(), gemma4_bool(), gemma4_null()});
});
}
// Select the appropriate value parser based on JSON schema type.
// Does NOT use schema() - the gemma4 types are pure PEG without
// JSON schema metadata, so GBNF is generated directly from the
// PEG structure.
common_peg_parser gemma4_value_for_type(const json & schema) {
if (!schema.contains("type") || !schema.at("type").is_string()) {
return gemma4_value();
}
std::string type = schema.at("type").get<std::string>();
if (type == "string") { return gemma4_string(); }
if (type == "number") { return gemma4_number(); }
if (type == "integer") { return gemma4_number(); }
if (type == "boolean") { return gemma4_bool(); }
if (type == "object") { return gemma4_dict(); }
if (type == "array") { return gemma4_array(); }
return gemma4_value();
}
};
} // anonymous namespace
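// Illustrative (not from the patch): the surface form these parsers accept
// uses <|"|> as the string delimiter and unquoted dict keys, e.g.
//   {city:<|"|>London<|"|>,tags:[<|"|>a<|"|>,<|"|>b<|"|>],count:3}
// gemma4_value() matches any of these forms; gemma4_dict()/gemma4_array()
// handle the nested container cases.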
// Helper to iterate over tools/functions
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
for (const auto & tool : tools) {
@ -43,7 +141,9 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
// Create the result structure
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.format = (autoparser.tools.format.mode == tool_format::TAG_WITH_GEMMA4_DICT)
? COMMON_CHAT_FORMAT_PEG_GEMMA4
: COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
auto parser = autoparser.build_parser(inputs);
@ -92,6 +192,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
ctx.content = &content;
ctx.reasoning = &reasoning;
// Build reasoning parser
ctx.reasoning_parser = reasoning.build_parser(ctx);
@ -440,7 +541,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
// The Gemma4 string quote token used in place of JSON "
common_peg_gemma4_builder g4(p);
static const std::string QUOTE = "<|\"|>";
common_peg_parser tool_choice = p.choice();
@ -451,7 +552,6 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
const auto & params = func.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
// No arguments - just match the function name with empty braces
auto func_parser = p.atomic(
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
p.tool_args(p.eps()) +
@ -486,9 +586,18 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
p.tool_arg_string_value(p.schema(p.until(QUOTE),
"tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
p.literal(QUOTE);
} else if (type == "number" || type == "integer") {
value_parser = p.tool_arg_value(g4.gemma4_number());
} else if (type == "boolean") {
value_parser = p.tool_arg_value(g4.gemma4_bool());
} else if (type == "null") {
value_parser = p.tool_arg_value(g4.gemma4_null());
} else if (type == "object") {
value_parser = p.tool_arg_value(g4.gemma4_dict());
} else if (type == "array") {
value_parser = p.tool_arg_value(g4.gemma4_array());
} else {
// Numbers, booleans: raw text up to the next comma or closing brace
value_parser = p.tool_arg_value(p.until_one_of({",", "}"}));
value_parser = p.tool_arg_value(g4.gemma4_value());
}
auto arg = p.tool_arg(
@ -538,9 +647,9 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
tool_calls = p.optional(tool_calls);
}
auto content_before_tools = p.until(format.per_call_start);
auto content_before_tools = p.until_one_of({ format.per_call_start, ctx.reasoning->start });
return ctx.reasoning_parser +
(force_tools ? p.eps() : p.optional(p.content(content_before_tools))) +
(force_tools ? p.eps() : p.optional(p.content(content_before_tools) + p.optional(ctx.reasoning_parser))) +
tool_calls + p.end();
}

View File

@ -1,7 +1,7 @@
#pragma once
#include "chat-auto-parser.h"
#include "peg-parser.h"
#include <functional>
#include <optional>
#include <string>

View File

@ -4,6 +4,7 @@
#include "common.h"
#include "jinja/caps.h"
#include "peg-parser.h"
#include "nlohmann/json.hpp"
#include <chrono>
#include <optional>
@ -215,12 +216,14 @@ struct tool_id_analysis {
// ============================================================================
struct analyze_content;
struct analyze_reasoning;
struct parser_build_context {
common_chat_peg_builder & p;
const generation_params & inputs;
const generation_params & inputs;
common_peg_parser reasoning_parser;
bool extracting_reasoning = false;
const analyze_reasoning * reasoning = nullptr;
const analyze_content * content = nullptr;
parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);

View File

@ -104,10 +104,11 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
analysis.tools.function.name_suffix = "";
analysis.tools.arguments.start = "{";
analysis.tools.arguments.end = "}";
analysis.tools.arguments.name_prefix = "";
analysis.tools.arguments.name_suffix = ":";
analysis.tools.arguments.separator = ",";
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.start = "<|channel>thought\n";
analysis.reasoning.start = "<|channel>thought";
analysis.reasoning.end = "<channel|>";
analysis.preserved_tokens.clear();
analysis.preserved_tokens.push_back("<|tool_call>");

View File

@ -75,6 +75,84 @@ static std::string escape_json_string_inner(const std::string & s) {
return escaped;
}
static const std::string GEMMA4_QUOTE = "<|\"|>";
static std::string normalize_gemma4_to_json(const std::string & input) {
std::string result;
result.reserve(input.size() * 2);
enum Ctx { DICT, ARRAY };
std::vector<Ctx> ctx;
auto is_ws = [](char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; };
auto skip_ws = [&](size_t & pos) {
while (pos < input.size() && is_ws(input[pos])) {
result += input[pos++];
}
};
auto quote_unquoted_key = [&](size_t & pos) {
if (pos < input.size() && input[pos] != '"' && input[pos] != '}') {
result += '"';
while (pos < input.size() && input[pos] != ':' && !is_ws(input[pos])) {
result += input[pos++];
}
result += '"';
skip_ws(pos);
}
};
size_t i = 0;
while (i < input.size()) {
if (i + GEMMA4_QUOTE.size() <= input.size() &&
input.compare(i, GEMMA4_QUOTE.size(), GEMMA4_QUOTE) == 0) {
result += '"';
i += GEMMA4_QUOTE.size();
continue;
}
char c = input[i];
if (c == '{') {
result += c;
ctx.push_back(DICT);
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
if (c == '}') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == '[') {
result += c;
ctx.push_back(ARRAY);
++i;
continue;
}
if (c == ']') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == ',' && !ctx.empty() && ctx.back() == DICT) {
result += c;
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
result += c;
++i;
}
return result;
}
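// hedged usage sketch (direct call is hypothetical; in this patch the
// function is only reached via common_chat_peg_gemma4_mapper below):
//   normalize_gemma4_to_json("{city:<|\"|>London<|\"|>,count:3}")
//     == "{\"city\":\"London\",\"count\":3}"
// unquoted dict keys gain quotes and each <|"|> becomes a JSON '"'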
// Convert Python-style single-quoted strings to JSON double-quoted strings
// Only converts outer string delimiters, properly handling escape sequences:
// - {'key': 'value'} -> {"key": "value"}
@ -214,6 +292,14 @@ std::string & common_chat_peg_mapper::args_target() {
return (current_tool && !current_tool->name.empty()) ? current_tool->arguments : args_buffer;
}
std::string common_chat_peg_mapper::normalize_container_value(const std::string & input) {
return normalize_quotes_to_json(input);
}
std::string common_chat_peg_gemma4_mapper::normalize_container_value(const std::string & input) {
return normalize_quotes_to_json(normalize_gemma4_to_json(input));
}
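// Gemma4 containers are first rewritten to JSON, then passed through the
// shared single-quote normalization used by the base mapper.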
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
const common_peg_parse_result & parse_result_arg) {
arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
@ -352,7 +438,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
// For potential containers, normalize Python-style single quotes to JSON double quotes
bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
if (is_potential_container) {
value_content = normalize_quotes_to_json(value_content);
value_content = normalize_container_value(value_content);
}
// Try to parse as JSON value (number, bool, null, object, array)

View File

@ -17,7 +17,9 @@ class common_chat_peg_mapper {
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
virtual void map(const common_peg_ast_node & node);
private:
protected:
virtual std::string normalize_container_value(const std::string & input);
private:
// Tool call handling state
std::optional<common_chat_tool_call> pending_tool_call; // Tool call waiting for name
common_chat_tool_call * current_tool = nullptr;
@ -30,6 +32,13 @@ class common_chat_peg_mapper {
std::string & args_target();
};
class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
public:
common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
protected:
std::string normalize_container_value(const std::string & input) override;
};
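// instantiated by common_chat_peg_parse() when params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4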
struct content_structure;
struct tool_call_structure;

View File

@ -13,6 +13,8 @@
#include "jinja/caps.h"
#include "peg-parser.h"
#include "nlohmann/json.hpp"
#include <cstdio>
#include <cstdlib>
#include <ctime>
@ -694,6 +696,8 @@ const char * common_chat_format_name(common_chat_format format) {
return "peg-simple";
case COMMON_CHAT_FORMAT_PEG_NATIVE:
return "peg-native";
case COMMON_CHAT_FORMAT_PEG_GEMMA4:
return "peg-gemma4";
default:
throw std::runtime_error("Unknown chat format");
}
@ -760,12 +764,12 @@ static void foreach_parameter(const json &
}
}
std::string common_chat_template_direct_apply(
static std::string common_chat_template_direct_apply_impl(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
const std::optional<json> & messages_override,
const std::optional<json> & tools_override,
const std::optional<json> & additional_context) {
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt) {
jinja::context ctx(tmpl.source());
nlohmann::ordered_json inp = nlohmann::ordered_json{
@ -812,6 +816,12 @@ std::string common_chat_template_direct_apply(
return result;
}
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
}
static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
@ -862,7 +872,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
data.supports_thinking = true;
data.thinking_start_tag = "[THINK]";
data.thinking_end_tag = "[/THINK]";
data.prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = {
"[THINK]",
@ -945,7 +955,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
adjusted_messages.push_back(msg);
}
auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
auto prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
// Check if we need to replace the return token with end token during
// inference and without generation prompt. For more details see:
@ -1072,7 +1082,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = {
">>>all",
@ -1166,7 +1176,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
@ -1289,7 +1299,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
@ -1368,7 +1378,7 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
@ -1439,7 +1449,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = false;
data.preserved_tokens = {
@ -1722,9 +1732,9 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
}
params.add_generation_prompt = false;
std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
params.add_generation_prompt = true;
std::string gen_prompt = common_chat_template_direct_apply(tmpl, params);
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
params.generation_prompt = diff.right;
@ -1758,7 +1768,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
common_chat_params data;
auto params_copy = params;
params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE;
data.prompt = common_chat_template_direct_apply(tmpl, params_copy);
data.prompt = common_chat_template_direct_apply_impl(tmpl, params_copy);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.generation_prompt = params.generation_prompt;
auto parser = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
@ -1905,8 +1915,13 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
// Try to extract any partial results from what was successfully parsed
common_chat_msg msg;
msg.role = "assistant";
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
std::unique_ptr<common_chat_peg_mapper> mapper;
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
} else {
mapper = std::make_unique<common_chat_peg_mapper>(msg);
}
mapper->from_ast(ctx.ast, result);
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str());
@ -1921,8 +1936,13 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
common_chat_msg msg;
msg.role = "assistant";
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
std::unique_ptr<common_chat_peg_mapper> mapper;
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
} else {
mapper = std::make_unique<common_chat_peg_mapper>(msg);
}
mapper->from_ast(ctx.ast, result);
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str());

View File

@ -3,12 +3,12 @@
#pragma once
#include "common.h"
#include "jinja/parser.h"
#include "nlohmann/json_fwd.hpp"
#include "peg-parser.h"
#include "jinja/parser.h"
#include "jinja/runtime.h"
#include "jinja/caps.h"
#include "nlohmann/json.hpp"
#include "nlohmann/json_fwd.hpp"
#include <chrono>
#include <functional>
@ -19,8 +19,6 @@
using chat_template_caps = jinja::caps;
using json = nlohmann::ordered_json;
#include <nlohmann/json_fwd.hpp>
struct common_chat_templates;
namespace autoparser {
@ -75,41 +73,9 @@ struct common_chat_template {
const std::string & bos_token() const { return bos_tok; }
const std::string & eos_token() const { return eos_tok; }
// TODO: this is ugly, refactor it somehow
json add_system(const json & messages, const std::string & system_prompt) const {
GGML_ASSERT(messages.is_array());
auto msgs_copy = messages;
if (!caps.supports_system_role) {
if (msgs_copy.empty()) {
msgs_copy.insert(msgs_copy.begin(), json{
{"role", "user"},
{"content", system_prompt}
});
} else {
auto & first_msg = msgs_copy[0];
if (!first_msg.contains("content")) {
first_msg["content"] = "";
}
first_msg["content"] = system_prompt + "\n\n"
+ first_msg["content"].get<std::string>();
}
} else {
if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
msgs_copy.insert(msgs_copy.begin(), json{
{"role", "system"},
{"content", system_prompt}
});
} else if (msgs_copy[0].at("role") == "system") {
msgs_copy[0]["content"] = system_prompt;
}
}
return msgs_copy;
}
chat_template_caps original_caps() const {
return caps;
}
};
struct common_chat_msg {
@ -184,6 +150,7 @@ enum common_chat_format {
// These are intended to be parsed by the PEG parser
COMMON_CHAT_FORMAT_PEG_SIMPLE,
COMMON_CHAT_FORMAT_PEG_NATIVE,
COMMON_CHAT_FORMAT_PEG_GEMMA4,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@ -256,8 +223,8 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
const std::string & bos_token_override = "",
const std::string & eos_token_override = "");
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
struct common_chat_params common_chat_templates_apply(const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs);
@ -274,9 +241,9 @@ std::string common_chat_format_example(const struct common_chat_templates *
bool use_jinja,
const std::map<std::string, std::string> & chat_template_kwargs);
const char * common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
const char * common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
// used by arg and server
const char * common_reasoning_format_name(common_reasoning_format format);
@ -302,7 +269,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt);
const autoparser::generation_params & inputs);

View File

@ -7464,9 +7464,6 @@ class Gemma4Model(Gemma3Model):
assert len(tokens) == vocab.vocab_size
# TODO @ngxson : there are some known (rare) issues with the tokenizer during development
# but I don't have time to dive into them right now;
# using a dedicated tokenizer name so that we can fix later without re-converting GGUF
self.gguf_writer.add_tokenizer_model("gemma4")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)

View File

@ -57,13 +57,14 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based
## Supported Operations
The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.
The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-based matrix multiplication (MUL_MAT_ID)** operations. Other operations are handled by the standard CPU backend.
| Operation | Status | Notes |
|:-------------|:-------:|:----------------------------------------------:|
| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul |
| MUL_MAT_ID | Support | Accelerated via ZenDNN LowOHA MatMul (MoE) |
*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
*Note:* Since MUL_MAT and MUL_MAT_ID are accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs and Mixture-of-Experts models).
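For orientation, a minimal sketch of how this op appears at the ggml graph level (names and shapes illustrative, not part of this patch; check `ggml.h` for the authoritative signature):

    // as: stacked expert weights, b: per-token inputs, ids: I32 expert indices
    struct ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);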
## DataType Supports
@ -181,7 +182,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen
## Known Issues
- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
- **Limited operation support**: Currently matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
@ -216,4 +217,4 @@ Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-t
## TODO
- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
- Expand operation support beyond MUL_MAT and MUL_MAT_ID (attention operations, activations, etc.)

View File

@ -68,7 +68,7 @@ Legend:
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | | ❌ |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | 🟡 | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |

File diff suppressed because it is too large

View File

@ -1009,8 +1009,8 @@ public:
bool get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response);
struct stored_graph {
ggml_context_ptr ctx_ptr;
ggml_cgraph * graph;
std::vector<uint8_t> buffer;
ggml_cgraph * graph;
};
private:
@ -1518,10 +1518,12 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
LOG_DBG("[%s] device: %u, n_nodes: %u, n_tensors: %u\n", __func__, device, n_nodes, n_tensors);
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
if (stored_graphs[device].buffer.size() < buf_size) {
stored_graphs[device].buffer.resize(buf_size);
}
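// reusing the same allocation keeps tensor addresses stable across calls,
// which limits growth of the CUDA backend's address-keyed caches (#21265)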
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ NULL,
/*.mem_buffer =*/ stored_graphs[device].buffer.data(),
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx_ptr { ggml_init(params) };
@ -1551,7 +1553,6 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
}
ggml_status status = ggml_backend_graph_compute(backends[device], graph);
GGML_ASSERT(status == GGML_STATUS_SUCCESS && "Unsuccessful graph computations are not supported with RPC");
stored_graphs[device].ctx_ptr.swap(ctx_ptr);
stored_graphs[device].graph = graph;
return true;
}

View File

@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
ExternalProject_Add(
zendnn
GIT_REPOSITORY https://github.com/amd/ZenDNN.git
GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf # ZenDNN-2026-WW08
GIT_TAG f79f7321a1add65ced6397a6bfab7edba6e3e14e # ZenDNN-2026-WW13
PREFIX ${ZENDNN_PREFIX}
SOURCE_DIR ${ZENDNN_SOURCE_DIR}
BINARY_DIR ${ZENDNN_BUILD_DIR}

View File

@ -190,6 +190,170 @@ static void ggml_zendnn_compute_forward_mul_mat(
}
}
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
static void ggml_zendnn_compute_forward_mul_mat_id(
ggml_backend_zendnn_context * ctx,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // expert weights
const ggml_tensor * src1 = dst->src[1]; // inputs
const ggml_tensor * ids = dst->src[2]; // expert ids
GGML_TENSOR_BINARY_OP_LOCALS
// exit for no tokens to process
if (ne2 == 0 || ne11 == 0) {
return;
}
ggml_type const vec_dot_type = src0->type;
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float_ref;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ne03 == 1);
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(ne3 == 1);
// row groups
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_experts
std::vector<int64_t> matrix_row_counts(n_as, 0);
std::vector<std::vector<mmid_row_mapping>> matrix_rows(n_as);
int64_t max_rows = 0;
// group rows by expert (preprocessing step)
for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
for (int id = 0; id < n_ids; ++id) {
const int32_t i02 = *(const int32_t *)((const char *)ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
matrix_rows[i02].push_back({id, iid1});
matrix_row_counts[i02]++;
if (matrix_row_counts[i02] > max_rows) {
max_rows = matrix_row_counts[i02];
}
}
}
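// worked example (hypothetical ids): with 2 tokens and n_ids = 2,
// ids = [[0, 3], [3, 1]] gives matrix_rows[0] = {(0,0)}, matrix_rows[1] = {(1,1)},
// matrix_rows[3] = {(1,0), (0,1)} (i1 = slot within token, i2 = token index),
// and max_rows = 2 sizes the per-expert gather buffer below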
if (max_rows == 0) {
return; // no rows to process
}
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
// size for converting src1 rows to vec_dot_type if needed
const size_t nbw1 = row_size;
const size_t nbw2 = nbw1 * ne11;
const size_t nbw3 = nbw2 * ne12;
const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0;
// size for MoE gather/scatter buffers
const size_t wdata_cur_size = max_rows * row_size;
const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01);
// allocate single buffer for all needs
const size_t total_size = src1_conv_size + wdata_cur_size + dst_cur_size;
if (ctx->work_size < total_size) {
ctx->work_data.reset(new char[total_size]);
ctx->work_size = total_size;
}
// partition the buffer
char * work_data = ctx->work_data.get();
char * wdata_cur = work_data + src1_conv_size;
char * dst_cur = wdata_cur + wdata_cur_size;
if (src1->type != vec_dot_type) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
#pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
const float * src1_f32 = (float *)((char *)src1->data + i11*nb11 + i12*nb12 + i13*nb13);
void * src1_conv = (char *)work_data + i11*nbw1 + i12*nbw2 + i13*nbw3;
from_float(src1_f32, src1_conv, ne10);
}
}
}
}
const void * wdata = src1->type == vec_dot_type ? src1->data : work_data;
// process each expert with gather -> gemm -> scatter pattern
for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
continue;
}
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
// gather input rows for this expert
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
const int64_t id = row_mapping.i1;
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2;
std::memcpy(
wdata_cur + ir1 * row_size,
(const char *) wdata + (i11 + i12*ne11) * row_size,
row_size
);
}
// batched gemm for all tokens in this expert
if (!ggml_zendnn_sgemm(ctx,
ne01, // m
cne1, // n
ne10, // k
src0_cur,
ne00, // lda
wdata_cur,
ne10, // ldb
dst_cur,
ne01, // ldc
src0->type,
vec_dot_type,
dst->type)) {
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
}
// scatter output rows to destination
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
const int64_t id = row_mapping.i1;
const int64_t i1 = id;
const int64_t i2 = row_mapping.i2;
std::memcpy(
(char *) dst->data + i1*nb1 + i2*nb2,
dst_cur + ir1 * ggml_row_size(dst->type, ne01),
ggml_row_size(dst->type, ne01)
);
}
}
}
// backend interface
static const char * ggml_backend_zendnn_get_name(ggml_backend_t backend) {
@ -218,6 +382,9 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
case GGML_OP_MUL_MAT:
ggml_zendnn_compute_forward_mul_mat(ctx, node);
break;
case GGML_OP_MUL_MAT_ID:
ggml_zendnn_compute_forward_mul_mat_id(ctx, node);
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@ -361,6 +528,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
return true;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
const ggml_tensor * weights = op->src[0];
const ggml_tensor * inputs = op->src[1];
@ -374,6 +542,17 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
ne0 < min_batch || ne1 < min_batch || ne10 < min_batch) {
return false;
}
// MUL_MAT_ID performs best with a moderate number of experts due to its
// gather + batched matmul + scatter approach. Future versions will leverage
// ZenDNN's grouped_gemm for better scalability with larger expert counts:
// https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_group_gemm_operator.md
if (op->op == GGML_OP_MUL_MAT_ID) {
const int64_t n_experts = weights->ne[2];
const int64_t max_experts = 32;
if (n_experts > max_experts) {
return false;
}
}
switch (weights->type) {
case GGML_TYPE_F32:
case GGML_TYPE_BF16:

View File

@ -0,0 +1,266 @@
{%- macro format_parameters(properties, required) -%}
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in properties | dictsort -%}
{%- set add_comma = false -%}
{%- if key not in standard_keys -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{{ key }}:{
{%- if value['description'] -%}
description:<|"|>{{ value['description'] }}<|"|>
{%- set add_comma = true -%}
{%- endif -%}
{%- if value['nullable'] %}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
nullable:true
{%- endif -%}
{%- if value['type'] | upper == 'STRING' -%}
{%- if value['enum'] -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
enum:{{ format_argument(value['enum']) }}
{%- endif -%}
{%- elif value['type'] | upper == 'OBJECT' -%}
,properties:{
{%- if value['properties'] is defined and value['properties'] is mapping -%}
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
{%- elif value is mapping -%}
{{- format_parameters(value, value['required'] | default([])) -}}
{%- endif -%}
}
{%- if value['required'] -%}
,required:[
{%- for item in value['required'] | default([]) -%}
<|"|>{{- item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- endif -%}
{%- elif value['type'] | upper == 'ARRAY' -%}
{%- if value['items'] is mapping and value['items'] -%}
,items:{
{%- set ns_items = namespace(found_first=false) -%}
{%- for item_key, item_value in value['items'] | dictsort -%}
{%- if item_value is not none -%}
{%- if ns_items.found_first %},{% endif -%}
{%- set ns_items.found_first = true -%}
{%- if item_key == 'properties' -%}
properties:{
{%- if item_value is mapping -%}
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
{%- endif -%}
}
{%- elif item_key == 'required' -%}
required:[
{%- for req_item in item_value -%}
<|"|>{{- req_item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- elif item_key == 'type' -%}
{%- if item_value is string -%}
type:{{ format_argument(item_value | upper) }}
{%- else -%}
type:{{ format_argument(item_value | map('upper') | list) }}
{%- endif -%}
{%- else -%}
{{ item_key }}:{{ format_argument(item_value) }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
}
{%- endif -%}
{%- endif -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
type:<|"|>{{ value['type'] | upper }}<|"|>}
{%- endif -%}
{%- endfor -%}
{%- endmacro -%}
{%- macro format_function_declaration(tool_data) -%}
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
{%- set params = tool_data['function']['parameters'] -%}
{%- if params -%}
,parameters:{
{%- if params['properties'] -%}
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
{%- endif -%}
{%- if params['required'] -%}
required:[
{%- for item in params['required'] -%}
<|"|>{{- item -}}<|"|>
{{- ',' if not loop.last -}}
{%- endfor -%}
],
{%- endif -%}
{%- if params['type'] -%}
type:<|"|>{{- params['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
{%- if 'response' in tool_data['function'] -%}
{%- set response_declaration = tool_data['function']['response'] -%}
,response:{
{%- if response_declaration['description'] -%}
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
{%- endif -%}
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
}
{%- endmacro -%}
{%- macro format_argument(argument, escape_keys=True) -%}
{%- if argument is string -%}
{{- '<|"|>' + argument + '<|"|>' -}}
{%- elif argument is boolean -%}
{{- 'true' if argument else 'false' -}}
{%- elif argument is mapping -%}
{{- '{' -}}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in argument | dictsort -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{%- if escape_keys -%}
{{- '<|"|>' + key + '<|"|>' -}}
{%- else -%}
{{- key -}}
{%- endif -%}
:{{- format_argument(value, escape_keys=escape_keys) -}}
{%- endfor -%}
{{- '}' -}}
{%- elif argument is sequence -%}
{{- '[' -}}
{%- for item in argument -%}
{{- format_argument(item, escape_keys=escape_keys) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- ']' -}}
{%- else -%}
{{- argument -}}
{%- endif -%}
{%- endmacro -%}
{%- macro strip_thinking(text) -%}
{%- set ns = namespace(result='') -%}
{%- for part in text.split('<channel|>') -%}
{%- if '<|channel>' in part -%}
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
{%- else -%}
{%- set ns.result = ns.result + part -%}
{%- endif -%}
{%- endfor -%}
{{- ns.result | trim -}}
{%- endmacro -%}
{%- set ns = namespace(prev_message_type=None) -%}
{%- set loop_messages = messages -%}
{{ bos_token }}
{#- Handle System/Tool Definitions Block -#}
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
{{- '<|turn>system\n' -}}
{#- Inject Thinking token at the very top of the FIRST system turn -#}
{%- if enable_thinking is defined and enable_thinking -%}
{{- '<|think|>' -}}
{%- set ns.prev_message_type = 'think' -%}
{%- endif -%}
{%- if messages[0]['role'] in ['system', 'developer'] -%}
{{- messages[0]['content'] | trim -}}
{%- set loop_messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- for tool in tools %}
{{- '<|tool>' -}}
{{- format_function_declaration(tool) | trim -}}
{{- '<tool|>' -}}
{%- endfor %}
{%- set ns.prev_message_type = 'tool' -%}
{%- endif -%}
{{- '<turn|>\n' -}}
{%- endif %}
{#- Loop through messages -#}
{%- for message in loop_messages -%}
{%- set ns.prev_message_type = None -%}
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
{{- '<|turn>' + role + '\n' }}
{%- if message['tool_calls'] -%}
{%- for tool_call in message['tool_calls'] -%}
{%- set function = tool_call['function'] -%}
{{- '<|tool_call>call:' + function['name'] + '{' -}}
{%- if function['arguments'] is mapping -%}
{%- set ns_args = namespace(found_first=false) -%}
{%- for key, value in function['arguments'] | dictsort -%}
{%- if ns_args.found_first %},{% endif -%}
{%- set ns_args.found_first = true -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- endfor -%}
{%- elif function['arguments'] is string -%}
{{- function['arguments'] -}}
{%- endif -%}
{{- '}<tool_call|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_call' -%}
{%- endif -%}
{%- if message['tool_responses'] -%}
{#- Tool Response handling -#}
{%- for tool_response in message['tool_responses'] -%}
{{- '<|tool_response>' -}}
{%- if tool_response['response'] is mapping -%}
{{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
{%- for key, value in tool_response['response'] | dictsort -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- '}' -}}
{%- else -%}
{{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
{%- endif -%}
{{- '<tool_response|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endif -%}
{%- if message['content'] is string -%}
{%- if role == 'model' -%}
{{- strip_thinking(message['content']) -}}
{%- else -%}
{{- message['content'] | trim -}}
{%- endif -%}
{%- elif message['content'] is sequence -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'text' -%}
{%- if role == 'model' -%}
{{- strip_thinking(item['text']) -}}
{%- else -%}
{{- item['text'] | trim -}}
{%- endif -%}
{%- elif item['type'] == 'image' -%}
{{- '\n\n<|image|>\n\n' -}}
{%- set ns.prev_message_type = 'image' -%}
{%- elif item['type'] == 'audio' -%}
{{- '<|audio|>' -}}
{%- set ns.prev_message_type = 'audio' -%}
{%- elif item['type'] == 'video' -%}
{{- '\n\n<|video|>\n\n' -}}
{%- set ns.prev_message_type = 'video' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- if not (message['tool_responses'] and not message['content']) -%}
{{- '<turn|>\n' -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{%- if ns.prev_message_type != 'tool_response' -%}
{{- '<|turn>model\n' -}}
{%- endif -%}
{%- if not enable_thinking | default(false) -%}
{{- '<|channel>thought\n<channel|>' -}}
{%- endif -%}
{%- endif -%}

View File

@ -66,9 +66,8 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
// note: the SWA cache is never quantized because it is relatively small
kv_swa = std::make_unique<llama_kv_cache>(
model, GGML_TYPE_F16, GGML_TYPE_F16,
model, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}

View File

@ -493,6 +493,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
// Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
// normalizer, then BPE merges run on the whole text without
// word-level pre-splitting. We only need to split on newlines
// since BPE merge lookup asserts no newlines in tokens.
regex_exprs = {
"[^\\n]+|[\\n]+",
};
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
break;
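// illustrative: " Hello" becomes "▁Hello" via llama_escape_whitespace before
// this split runs, so merges can yield SPM-style pieces like "▁Hello" directly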
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@ -506,6 +516,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
}
std::vector<std::string> regex_exprs;
bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
};
struct llm_tokenizer_bpe_session {
@ -550,9 +561,10 @@ struct llm_tokenizer_bpe_session {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
symbols_final.clear();
auto tok_pre = vocab.get_pre_type();
for (const auto & word : word_collection) {
work_queue = llm_bigram_bpe::queue();
@ -565,6 +577,13 @@ struct llm_tokenizer_bpe_session {
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
} else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
// fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
auto tok = vocab.text_to_token(word);
if (tok != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
}
}
while (offset < word.size()) {
@ -1864,7 +1883,31 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_pad_id = 3; // <|plamo:pad|>
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "gemma4") {
type = LLAMA_VOCAB_TYPE_SPM;
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
{
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
std::string first;
std::string second;
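// merge entries are stored as "left right"; search from index 1 so a
// left piece that is itself a space is not mistaken for the separator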
const size_t pos = word.find(' ', 1);
if (pos != std::string::npos) {
first = word.substr(0, pos);
second = word.substr(pos + 1);
}
bpe_ranks.emplace(std::make_pair(first, second), i);
}
}
// default special tokens (to be read from GGUF)
special_bos_id = LLAMA_TOKEN_NULL;
@ -1874,7 +1917,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
tokenizer_pre = "gemma4";
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
@ -1882,6 +1925,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, only BPE models have pre-tokenizers
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
escape_whitespaces = false;
clean_spaces = true;
if (tokenizer_pre.empty()) {
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
@ -1948,6 +1992,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "jais-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
} else if (
tokenizer_pre == "gemma4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
escape_whitespaces = true;
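// replace spaces with ▁ before BPE so merges see SPM-style pieces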
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||
@ -3045,6 +3093,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (escape_whitespaces) {
llama_escape_whitespace(text);
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
@ -3224,6 +3276,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
return _try_copy(token_text.data(), token_text.size());
}
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
if (escape_whitespaces) {
// SPM-style BPE: tokens contain ▁ for spaces
std::string result = token_text;
llama_unescape_whitespace(result);
return _try_copy(result.data(), result.size());
}
std::string result = llama_decode_text(token_text);
return _try_copy(result.data(), result.size());
}
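
Taken together, the tokenize and token_to_piece hunks make this BPE path whitespace-compatible with SPM: spaces are escaped to U+2581 (▁) before splitting, and normal tokens are unescaped when rendered back to text. A behavioral sketch of that round trip (the real helpers are llama_escape_whitespace / llama_unescape_whitespace, referenced above; this is not their exact implementation):

#include <string>

// Replace every occurrence of `from` with `to`, in place.
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

static void escape_whitespace(std::string & text) {
    replace_all(text, " ", "\xe2\x96\x81"); // ' ' -> U+2581 LOWER ONE EIGHTH BLOCK
}

static void unescape_whitespace(std::string & text) {
    replace_all(text, "\xe2\x96\x81", " "); // U+2581 -> ' '
}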

View File

@@ -58,6 +58,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
};
struct LLM_KV;

View File

@@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
return false;
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
{ "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
start += offset;
}
return unicode_byte_encoding_process(bpe_words);
if (byte_encode) {
return unicode_byte_encoding_process(bpe_words);
}
return bpe_words;
}
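
The new byte_encode parameter makes the final GPT-2-style byte-to-printable mapping optional, so a caller can receive the raw regex splits instead; presumably this is what lets the SPM-flavored Gemma4 path reuse this function. A usage sketch against the signature above (the regex literal is a placeholder, not a real pre-tokenizer pattern):

#include <string>
#include <vector>
#include "unicode.h" // declares unicode_regex_split (see the header diff below)

std::vector<std::string> split_raw(const std::string & text) {
    const std::vector<std::string> regexes = { "\\s+|\\S+" }; // placeholder pattern
    // Passing byte_encode = false skips unicode_byte_encoding_process(...),
    // returning the split pieces exactly as they appear in the input text.
    return unicode_regex_split(text, regexes, /*byte_encode=*/false);
}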

View File

@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
bool unicode_cpt_is_han(uint32_t cpt);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode = true);

View File

@@ -589,6 +589,51 @@ static common_chat_tool amount_tool{
})",
};
static common_chat_tool toggle_tool{
/* .name = */ "toggle",
/* .description = */ "Toggle a feature",
/* .parameters = */ R"({
"type": "object",
"properties": {
"enabled": {
"type": "boolean",
"description": "Whether to enable the feature"
}
},
"required": ["enabled"]
})",
};
static common_chat_tool nullable_tool{
/* .name = */ "set_nullable",
/* .description = */ "Set a nullable value",
/* .parameters = */ R"({
"type": "object",
"properties": {
"value": {
"type": "null",
"description": "A null value"
}
},
"required": ["value"]
})",
};
static common_chat_tool config_tool{
/* .name = */ "set_config",
/* .description = */ "Set configuration",
/* .parameters = */ R"({
"type": "object",
"properties": {
"config": {
"type": "object",
"description": "Configuration dict"
}
},
"required": ["config"]
})",
};
static common_chat_tool imaginary_number_tool{
/* .name = */ "imaginary_number",
/* .description = */ "Imaginary number converter",
@@ -1869,6 +1914,130 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).expect_reconstruction().run();
}
{
// Google Gemma 4 (tool calling with Gemma4 dict format)
auto tst = peg_tester("models/templates/gemma4.jinja");
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
// Simple tool call with string argument
tst.test(
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", R"({"city": "London"})"))
.run();
// Tool call with string argument containing special chars
tst.test(
"<|tool_call>call:get_time{city:<|\"|>San Francisco<|\"|>}<tool_call|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", R"({"city": "San Francisco"})"))
.run();
// Tool call with empty args
tst.test(
"<|tool_call>call:empty_args{}<tool_call|>")
.tools({ empty_args_tool })
.expect(message_with_tool_calls("empty_args", "{}"))
.run();
// Tool call with string and content
tst.test(
"Hello, world!\nWhat's up?<|tool_call>call:get_time{city:<|\"|>Paris<|\"|>}<tool_call|>")
.tools({ get_time_tool })
.expect(message_with_content_and_tool_call("Hello, world!\nWhat's up?", "get_time", R"({"city": "Paris"})"))
.run();
// Parallel tool calls
tst.test(
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>"
"<|tool_call>call:get_weather{city:<|\"|>Paris<|\"|>}<tool_call|>")
.tools({ get_time_tool, get_weather_tool })
.parallel_tool_calls(true)
.expect_tool_calls({
{ "get_time", R"({"city": "London"})", "" },
{ "get_weather", R"({"city": "Paris"})", "" },
})
.run();
// Tool call with integer argument (number type)
tst.test(
"<|tool_call>call:special_function{arg1:42}<tool_call|>")
.tools({ special_function_tool })
.expect(message_with_tool_calls("special_function", R"({"arg1": 42})"))
.run();
// Tool call with negative number argument
tst.test(
"<|tool_call>call:special_function{arg1:-7}<tool_call|>")
.tools({ special_function_tool })
.expect(message_with_tool_calls("special_function", R"({"arg1": -7})"))
.run();
// Tool call with decimal number argument
tst.test(
"<|tool_call>call:amount{orig:3.14}<tool_call|>")
.tools({ amount_tool })
.expect(message_with_tool_calls("amount", R"({"orig": 3.14})"))
.run();
// Tool call with boolean argument (true)
tst.test(
"<|tool_call>call:toggle{enabled:true}<tool_call|>")
.tools({ toggle_tool })
.expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
.run();
// Tool call with boolean argument (false)
tst.test(
"<|tool_call>call:toggle{enabled:false}<tool_call|>")
.tools({ toggle_tool })
.expect(message_with_tool_calls("toggle", R"({"enabled": false})"))
.run();
// Tool call with null argument
tst.test(
"<|tool_call>call:set_nullable{value:null}<tool_call|>")
.tools({ nullable_tool })
.expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
.run();
// Tool call with array argument (todo list)
tst.test(
"<|tool_call>call:todo_list{todos:[<|\"|>buy milk<|\"|>,<|\"|>walk dog<|\"|>]}<tool_call|>")
.tools({ todo_list })
.expect(message_with_tool_calls("todo_list", R"({"todos":["buy milk","walk dog"]})"))
.run();
// Tool call with object/dict argument
tst.test(
"<|tool_call>call:set_config{config:{theme:<|\"|>dark<|\"|>,count:3}}<tool_call|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config":{"theme":"dark","count":3}})"))
.run();
// Tool call with empty array
tst.test(
"<|tool_call>call:todo_list{todos:[]}<tool_call|>")
.tools({ todo_list })
.expect(message_with_tool_calls("todo_list", R"({"todos":[]})"))
.run();
// Tool call with empty dict
tst.test(
"<|tool_call>call:set_config{config:{}}<tool_call|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config":{}})"))
.run();
// Tool call with scientific notation number
tst.test(
"<|tool_call>call:amount{orig:1.5e10}<tool_call|>")
.tools({ amount_tool })
.expect(message_with_tool_calls("amount", R"({"orig": 1.5e10})"))
.run();
}
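// Summary of the Gemma4 tool-call wire format exercised above; the
// delimiters are copied from the test strings and the right-hand side
// shows the JSON each case is expected to parse into (an illustration
// derived from these tests, not a grammar specification):
//
//   <|tool_call>call:NAME{ARGS}<tool_call|>               one block per call
//   string   city:<|"|>London<|"|>                     -> "city": "London"
//   number   arg1:42, arg1:-7, orig:3.14, orig:1.5e10  -> emitted verbatim
//   boolean  enabled:true / enabled:false              -> true / false
//   null     value:null                                -> null
//   array    todos:[<|"|>buy milk<|"|>,<|"|>walk dog<|"|>] -> ["buy milk","walk dog"]
//   object   config:{theme:<|"|>dark<|"|>,count:3}     -> {"theme":"dark","count":3}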
{
// Qwen-QwQ-32B (reasoning model)
auto tst = peg_tester("models/templates/Qwen-QwQ-32B.jinja");

View File

@@ -155,8 +155,8 @@ struct server_slot {
int64_t t_start_process_prompt;
int64_t t_start_generation;
double t_prompt_processing; // ms
double t_token_generation; // ms
double t_prompt_processing = 0.0; // ms
double t_token_generation = 0.0; // ms
std::function<void(int /* id_slot */)> callback_on_release;
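
The default member initializers above address a concrete failure mode: a slot released before it ever timed a prompt would read indeterminate doubles into its metrics. A minimal illustration of the bug class and the fix (not the server code itself):

// Reading a never-assigned non-static member yields an indeterminate value
// (undefined behavior when used), so any metric derived from it is garbage.
struct slot_bad  { double t_prompt_processing;       }; // indeterminate until set
struct slot_good { double t_prompt_processing = 0.0; }; // well-defined default

double report_ms(const slot_good & s) {
    return s.t_prompt_processing; // safe even if no prompt was ever processed
}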

View File

@@ -261,14 +261,14 @@ struct result_timings {
int32_t cache_n = -1;
int32_t prompt_n = -1;
double prompt_ms;
double prompt_per_token_ms;
double prompt_per_second;
double prompt_ms = 0.0;
double prompt_per_token_ms = 0.0;
double prompt_per_second = 0.0;
int32_t predicted_n = -1;
double predicted_ms;
double predicted_per_token_ms;
double predicted_per_second;
double predicted_ms = 0.0;
double predicted_per_token_ms = 0.0;
double predicted_per_second = 0.0;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;
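
The 0.0 defaults above keep these fields well-defined even when a result is emitted before any timing ran. For reference, a sketch of how per-token and per-second figures are conventionally derived from a token count and a total duration (assumed formulas with zero-guards matching the 0.0 defaults; not necessarily the exact server code):

#include <cstdint>

// total_ms: elapsed milliseconds; n: number of tokens processed.
static double per_token_ms(double total_ms, int32_t n) {
    return n > 0 ? total_ms / n : 0.0;
}

static double per_second(double total_ms, int32_t n) {
    return total_ms > 0.0 ? 1e3 * n / total_ms : 0.0; // tokens per second
}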