model: try to improve Qwen3 Next (#18683 )

* qwen3next: simplify qkvz projection * use ggml_swiglu_split * revert swiglu_split, but remove redundant repeat() * fix missing reshape * rm 2 redundant transposes * move mul_mat(k,q) to outside of chunking * rm redundant cont * improve g_cs_chunk * add comments about no cont * use std::pair instead of ggml_concat * vectorize key_gdiff calculation * rm unused tensor * avoid ggml_concat inside loop * bring back ggml_concat as it may not work on other backend * nits
readme : update UIs (#18751 )
2026-01-11 12:53:33 +01:00 · 2026-01-11 13:46:50 +02:00 · 2026-01-11 12:23:36 +01:00 · 2026-01-10 21:57:44 -08:00 · 2026-01-11 01:12:57 +08:00 · 2026-01-10 17:19:01 +01:00
109 changed files with 17755 additions and 9093 deletions
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@ -33,6 +33,7 @@ FROM ubuntu:$UBUNTU_VERSION AS base

 RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
+    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -152,13 +152,13 @@ jobs:
          DAWN_VERSION="v2.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
          curl -L -o artifact.zip \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
          mkdir dawn
          unzip artifact.zip
-          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1

      - name: Build
        id: cmake_build
@ -532,13 +532,13 @@ jobs:
          DAWN_VERSION="v2.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
          curl -L -o artifact.zip \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
          mkdir dawn
          unzip artifact.zip
-          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1

      - name: Build
        id: cmake_build
@ -1418,7 +1418,6 @@ jobs:
          echo "FIXME: test on devices"

  openEuler-latest-cmake-cann:
-    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
    defaults:
      run:
        shell: bash -el {0}
@ -1705,6 +1704,34 @@ jobs:
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

+  ggml-ci-mac-webgpu:
+    runs-on: [self-hosted, macOS, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v2.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          curl -L -o artifact.zip \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          mkdir dawn
+          unzip artifact.zip
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
  ggml-ci-mac-vulkan:
    runs-on: [self-hosted, macOS, ARM64]

--- a/.gitignore
+++ b/.gitignore
@ -130,6 +130,7 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
+/run-spec.sh
 /.ccache/

 # IDE
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -182,6 +182,9 @@ if (NOT MSVC)
    endif()
 endif()

+include("cmake/license.cmake")
+license_add_file("llama.cpp" "LICENSE")
+
 #
 # 3rd-party
 #
@ -235,6 +238,19 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
    add_subdirectory(tools)
 endif()

+# Automatically add all files from the 'licenses' directory
+file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+foreach(FILE_PATH ${EXTRA_LICENSES})
+    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+    license_add_file("${NAME}" "${FILE_PATH}")
+endforeach()
+
+if (LLAMA_BUILD_COMMON)
+    license_generate(common)
+endif()
+
 #
 # install
 #
--- a/README.md
+++ b/README.md
@ -200,6 +200,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
@ -482,21 +483,6 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

-## [`llama-run`](tools/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
- <details>
-    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
-
-    ```bash
-    llama-run granite-code
-    ```
-
-    </details>
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)
-
 ## [`llama-simple`](examples/simple)

 #### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
@ -600,7 +586,6 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
 - [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
 - [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
--- a/SECURITY.md
+++ b/SECURITY.md
@ -1,12 +1,48 @@
 # Security Policy

+ - [**Reporting a vulnerability**](#reporting-a-vulnerability)
+ - [**Requirements**](#requirements)
+ - [**Covered Topics**](#covered-topics)
 - [**Using llama.cpp securely**](#using-llamacpp-securely)
   - [Untrusted models](#untrusted-models)
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
   - [Untrusted environments or networks](#untrusted-environments-or-networks)
   - [Multi-Tenant environments](#multi-tenant-environments)
- - [**Reporting a vulnerability**](#reporting-a-vulnerability)
+
+## Reporting a vulnerability
+
+If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+
+A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
+
+## Requirements
+
+Before submitting your report, ensure you meet the following requirements:
+
+- You have read this policy and fully understand it.
+- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
+- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
+
+Maintainers reserve the right to close the report if these requirements are not fulfilled.
+
+## Covered Topics
+
+Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
+
+- `src/**/*`
+- `ggml/**/*`
+- `gguf-py/**/*`
+- `tools/server/*` (note: Web UI is not covered)
+
+Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
+
+For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.

 ## Using llama.cpp securely

@ -55,19 +91,3 @@ If you intend to run multiple models in parallel with shared memory, it is your
 3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

 4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
-
-## Reporting a vulnerability
-
-Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
-
-<!-- normal version -->
-However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-
-Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
-
-Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
-
-A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
-
-> [!IMPORTANT]
-> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
--- a/ci/run.sh
+++ b/ci/run.sh
@ -105,7 +105,20 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+
+    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
+        if [ -z "${CMAKE_PREFIX_PATH}" ]; then
+            export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
+        else
+            export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
+        fi
+    fi
+
+    # For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
+    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
+        CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
+    fi
 fi

 if [ ! -z ${GG_BUILD_MUSA} ]; then
--- a/cmake/license.cmake
+++ b/cmake/license.cmake
@ -0,0 +1,40 @@
+define_property(GLOBAL PROPERTY LICENSE_TEXT
+    BRIEF_DOCS "Embedded licenses"
+    FULL_DOCS  "Global string containing all aggregated licenses"
+)
+
+function(license_add_file NAME FILE)
+    if(NOT IS_ABSOLUTE "${FILE}")
+        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
+    endif()
+    if(EXISTS "${FILE}")
+        set(TITLE "License for ${NAME}")
+        string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
+        file(READ "${FILE}" TEXT)
+        get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
+        string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
+        set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
+    else()
+        message(WARNING "License file '${FILE}' not found")
+    endif()
+endfunction()
+
+function(license_generate TARGET_NAME)
+    message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
+    get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
+
+    set(CPP_CONTENT "// Generated by CMake\n\n")
+    string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
+    string(APPEND CPP_CONTENT "${TEXT}")
+    string(APPEND CPP_CONTENT "nullptr\n")
+    string(APPEND CPP_CONTENT "};\n")
+
+    set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
+    file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
+
+    if(TARGET ${TARGET_NAME})
+        target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
+    else()
+        message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
+    endif()
+endfunction()
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -155,27 +155,3 @@ if (LLAMA_LLGUIDANCE)
 endif ()

 target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
-
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        add_custom_command(
-            POST_BUILD
-            TARGET ${TARGET}
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                "${LICENSE_FILE}"
-                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
-            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
-        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
-    endforeach()
-endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -2,10 +2,11 @@

 #include "chat.h"
 #include "common.h"
+#include "download.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "download.h"
+#include "preset.h"

 // fix problem with std::min and std::max
 #if defined(_WIN32)
@ -47,6 +48,8 @@

 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

+extern const char * LICENSES[];
+
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;

@ -268,6 +271,55 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
    }
 }

+static std::string clean_file_name(const std::string & fname) {
+    std::string clean_fname = fname;
+    string_replace_all(clean_fname, "\\", "_");
+    string_replace_all(clean_fname, "/", "_");
+    return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    // the returned hf_repo is without tag
+    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+    // "latest" tag (default if not specified) is translated to "default" preset
+    if (hf_tag == "latest") {
+        hf_tag = "default";
+    }
+
+    const bool offline = params.offline;
+    std::string model_endpoint = get_model_endpoint();
+    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
+
+    // prepare local path for caching
+    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
+    auto preset_path = fs_get_cache_file(preset_fname);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const bool has_preset = status >= 200 && status < 400;
+
+    // remote preset is optional, so we don't error out if not found
+    if (has_preset) {
+        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        common_preset_context ctx(ex, /* only_remote_allowed */ true);
+        common_preset global;
+        auto remote_presets = ctx.load_from_ini(preset_path, global);
+        remote_presets = ctx.cascade(global, remote_presets);
+        if (remote_presets.find(hf_tag) != remote_presets.end()) {
+            common_preset preset = remote_presets.at(hf_tag);
+            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+            preset.apply_to_params(params);
+        } else {
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+        }
+    } else {
+        LOG_INF("%s", "no remote preset found, skipping\n");
+    }
+
+    return has_preset;
+}
+
 struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
@ -309,9 +361,7 @@ static handle_model_result common_params_handle_model(
            // make sure model path is present (for caching purposes)
            if (model.path.empty()) {
                // this is to avoid different repo having same file name, or same file name in different subdirs
-                std::string filename = model.hf_repo + "_" + model.hf_file;
-                // to make sure we don't have any slashes in the filename
-                string_replace_all(filename, "/", "_");
+                std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
                model.path = fs_get_cache_file(filename);
            }

@ -425,61 +475,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        }
    };

-    std::set<std::string> seen_args;
+    auto parse_cli_args = [&]() {
+        std::set<std::string> seen_args;

-    for (int i = 1; i < argc; i++) {
-        const std::string arg_prefix = "--";
+        for (int i = 1; i < argc; i++) {
+            const std::string arg_prefix = "--";

-        std::string arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-        }
-        if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
-        }
-        auto & tmp = arg_to_options[arg];
-        auto opt = *tmp.first;
-        bool is_positive = tmp.second;
-        if (opt.has_value_from_env()) {
-            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
-        }
-        try {
-            if (opt.handler_void) {
-                opt.handler_void(params);
-                continue;
+            std::string arg = argv[i];
+            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+                std::replace(arg.begin(), arg.end(), '_', '-');
            }
-            if (opt.handler_bool) {
-                opt.handler_bool(params, is_positive);
-                continue;
+            if (arg_to_options.find(arg) == arg_to_options.end()) {
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
            }
+            if (!seen_args.insert(arg).second) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+            }
+            auto & tmp = arg_to_options[arg];
+            auto opt = *tmp.first;
+            bool is_positive = tmp.second;
+            if (opt.has_value_from_env()) {
+                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+            }
+            try {
+                if (opt.handler_void) {
+                    opt.handler_void(params);
+                    continue;
+                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, is_positive);
+                    continue;
+                }

-            // arg with single value
-            check_arg(i);
-            std::string val = argv[++i];
-            if (opt.handler_int) {
-                opt.handler_int(params, std::stoi(val));
-                continue;
-            }
-            if (opt.handler_string) {
-                opt.handler_string(params, val);
-                continue;
-            }
+                // arg with single value
+                check_arg(i);
+                std::string val = argv[++i];
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(val));
+                    continue;
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, val);
+                    continue;
+                }

-            // arg with 2 values
-            check_arg(i);
-            std::string val2 = argv[++i];
-            if (opt.handler_str_str) {
-                opt.handler_str_str(params, val, val2);
-                continue;
+                // arg with 2 values
+                check_arg(i);
+                std::string val2 = argv[++i];
+                if (opt.handler_str_str) {
+                    opt.handler_str_str(params, val, val2);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(string_format(
+                    "error while handling argument \"%s\": %s\n\n"
+                    "usage:\n%s\n\nto show complete usage, run with -h",
+                    arg.c_str(), e.what(), opt.to_string().c_str()));
            }
-        } catch (std::exception & e) {
-            throw std::invalid_argument(string_format(
-                "error while handling argument \"%s\": %s\n\n"
-                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), opt.to_string().c_str()));
+        }
+    };
+
+    // parse the first time to get -hf option (used for remote preset)
+    parse_cli_args();
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty()) {
+        std::string cli_hf_repo = params.model.hf_repo;
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
+        }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
        }
    }

@ -679,7 +755,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
        "llama-quantize",
        "llama-qwen2vl-cli",
        "llama-retrieval",
-        "llama-run",
        "llama-save-load-state",
        "llama-server",
        "llama-simple",
@ -966,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            exit(0);
        }
    ));
+    add_opt(common_arg(
+        {"--license"},
+        "show source code license and dependencies",
+        [](common_params &) {
+            for (int i = 0; LICENSES[i]; ++i) {
+                printf("%s\n", LICENSES[i]);
+            }
+            exit(0);
+        }
+    ));
    add_opt(common_arg(
        {"-cl", "--cache-list"},
        "show list of models in cache",
@ -2089,11 +2174,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"--mmap"},
        {"--no-mmap"},
-        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.use_mmap = value;
+            if (value) {
+                params.use_direct_io = false;  // disable direct io when mmap is explicitly enabled
+            }
        }
    ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"-dio", "--direct-io"},
+        {"-ndio", "--no-direct-io"},
+        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_direct_io = value;
+        }
+    ).set_env("LLAMA_ARG_DIO"));
    add_opt(common_arg(
        {"--numa"}, "TYPE",
        "attempt optimizations that help on some NUMA systems\n"
@ -2245,7 +2341,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            std::vector<std::string> split_arg{ it, {} };
            if (split_arg.size() >= llama_max_devices()) {
                throw std::invalid_argument(
-                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                );
            }
            for (size_t i = 0; i < llama_max_devices(); ++i) {
@ -2285,10 +2381,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_FIT"));
    add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "MiB",
-        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
-        [](common_params & params, int value) {
-            params.fit_params_target = value * size_t(1024*1024);
+        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+        string_format("target margin per device for --fit, comma-separated list of values, "
+            "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                );
+            }
+            if (split_arg.size() == 1) {
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                return;
+            }
+            for (size_t i = 0; i < split_arg.size(); i++) {
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+            }
        }
    ).set_env("LLAMA_ARG_FIT_TARGET"));
    add_opt(common_arg(
--- a/common/arg.h
+++ b/common/arg.h
@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
    if (params.fit_params) {
        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

@ -1366,6 +1366,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
+    mparams.use_direct_io   = params.use_direct_io;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
--- a/common/common.h
+++ b/common/common.h
@ -332,12 +332,14 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers       = -1;               // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;                // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};              // how split tensors should be distributed across GPUs
-    bool    fit_params         = true;             // whether to fit unset model/context parameters to free device memory
-    size_t  fit_params_target  = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
-    int32_t fit_params_min_ctx = 4096;             // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+    // margin per device in bytes for fitting parameters to free memory:
+    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@ -428,7 +430,8 @@ struct common_params {
    bool kv_unified        = false; // enable unified KV cache

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap          = true;  // use mmap for faster loads
+    bool use_mmap          = true;  // enable mmap to use filesystem cache
+    bool use_direct_io     = true;  // read from disk without buffering for faster model loading
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
--- a/common/download.cpp
+++ b/common/download.cpp
@ -157,6 +157,20 @@ static std::string read_etag(const std::string & path) {
    return none;
 }

+static bool is_http_status_ok(int status) {
+    return status >= 200 && status < 400;
+}
+
+std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+    return {hf_repo, tag};
+}
+
 #ifdef LLAMA_USE_CURL

 //
@ -306,11 +320,14 @@ static bool common_download_head(CURL *              curl,
 }

 // download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
                                               const std::string & path,
-                                               const std::string & bearer_token) {
+                                               const std::string & bearer_token,
+                                               const common_header_list & custom_headers) {
    static const int max_attempts        = 3;
    static const int retry_delay_seconds = 2;
+
    for (int i = 0; i < max_attempts; ++i) {
        std::string etag;

@ -330,6 +347,11 @@ static bool common_download_file_single_online(const std::string & url,
        common_load_model_from_url_headers headers;
        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
        curl_slist_ptr http_headers;
+
+        for (const auto & h : custom_headers) {
+             std::string s = h.first + ": " + h.second;
+             http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
+        }
        const bool     was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
        if (!was_perform_successful) {
            head_request_ok = false;
@ -365,7 +387,7 @@ static bool common_download_file_single_online(const std::string & url,
                LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
                if (remove(path.c_str()) != 0) {
                    LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                    return false;
+                    return -1;
                }
            }

@ -374,14 +396,14 @@ static bool common_download_file_single_online(const std::string & url,
                if (std::filesystem::exists(path_temporary)) {
                    if (remove(path_temporary.c_str()) != 0) {
                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
-                        return false;
+                        return -1;
                    }
                }

                if (std::filesystem::exists(path)) {
                    if (remove(path.c_str()) != 0) {
                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                        return false;
+                        return -1;
                    }
                }
            }
@ -408,23 +430,27 @@ static bool common_download_file_single_online(const std::string & url,

            long http_code = 0;
            curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-            if (http_code < 200 || http_code >= 400) {
+
+            int status = static_cast<int>(http_code);
+            if (!is_http_status_ok(http_code)) {
                LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-                return false;
+                return status; // TODO: maybe only return on certain codes
            }

            if (rename(path_temporary.c_str(), path.c_str()) != 0) {
                LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-                return false;
+                return -1;
            }
+
+            return static_cast<int>(http_code);
        } else {
            LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
-        }

-        break;
+            return 304; // Not Modified - fake cached response
+        }
    }

-    return true;
+    return -1; // max attempts reached
 }

 std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
@ -454,8 +480,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
        curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
    }
    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+
    for (const auto & header : params.headers) {
-        http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+        std::string header_ = header.first + ": " + header.second;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
    }
    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);

@ -617,9 +645,11 @@ static bool common_pull_file(httplib::Client & cli,
 }

 // download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
                                               const std::string & path,
-                                               const std::string & bearer_token) {
+                                               const std::string & bearer_token,
+                                               const common_header_list & custom_headers) {
    static const int max_attempts        = 3;
    static const int retry_delay_seconds = 2;

@ -629,6 +659,9 @@ static bool common_download_file_single_online(const std::string & url,
    if (!bearer_token.empty()) {
        default_headers.insert({"Authorization", "Bearer " + bearer_token});
    }
+    for (const auto & h : custom_headers) {
+        default_headers.emplace(h.first, h.second);
+    }
    cli.set_default_headers(default_headers);

    const bool file_exists = std::filesystem::exists(path);
@ -647,8 +680,10 @@ static bool common_download_file_single_online(const std::string & url,
            LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
            if (file_exists) {
                LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
-                return true;
+                return 304; // 304 Not Modified - fake cached response
            }
+            return head->status; // cannot use cached file, return raw status code
+            // TODO: maybe retry only on certain codes
        }

        std::string etag;
@ -680,12 +715,12 @@ static bool common_download_file_single_online(const std::string & url,
        if (file_exists) {
            if (!should_download_from_scratch) {
                LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
-                return true;
+                return 304; // 304 Not Modified - fake cached response
            }
            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
            if (remove(path.c_str()) != 0) {
                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
+                return -1;
            }
        }

@ -697,7 +732,7 @@ static bool common_download_file_single_online(const std::string & url,
                existing_size = std::filesystem::file_size(path_temporary);
            } else if (remove(path_temporary.c_str()) != 0) {
                LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
-                return false;
+                return -1;
            }
        }

@ -718,15 +753,16 @@ static bool common_download_file_single_online(const std::string & url,

        if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
+            return -1;
        }
        if (!etag.empty()) {
            write_etag(path, etag);
        }
-        break;
+
+        return head->status; // TODO: use actual GET status?
    }

-    return true;
+    return -1; // max attempts reached
 }

 std::pair<long, std::vector<char>> common_remote_get_content(const std::string          & url,
@ -734,13 +770,9 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
    auto [cli, parts] = common_http_client(url);

    httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
+
    for (const auto & header : params.headers) {
-        size_t pos = header.find(':');
-        if (pos != std::string::npos) {
-            headers.emplace(header.substr(0, pos), header.substr(pos + 1));
-        } else {
-            headers.emplace(header, "");
-        }
+        headers.emplace(header.first, header.second);
    }

    if (params.timeout > 0) {
@ -769,32 +801,45 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string

 #if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)

-static bool common_download_file_single(const std::string & url,
-                                        const std::string & path,
-                                        const std::string & bearer_token,
-                                        bool                offline) {
+int common_download_file_single(const std::string & url,
+                                const std::string & path,
+                                const std::string & bearer_token,
+                                bool offline,
+                                const common_header_list & headers) {
    if (!offline) {
-        return common_download_file_single_online(url, path, bearer_token);
+        return common_download_file_single_online(url, path, bearer_token, headers);
    }

    if (!std::filesystem::exists(path)) {
        LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
-        return false;
+        return -1;
    }

    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-    return true;
+    return 304; // Not Modified - fake cached response
 }

 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
+                                          const std::string & bearer_token,
+                                          bool offline,
+                                          const common_header_list & headers) {
    // Prepare download in parallel
    std::vector<std::future<bool>> futures_download;
+    futures_download.reserve(urls.size());
+
    for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token, offline);
-        }, item));
+        futures_download.push_back(
+            std::async(
+                std::launch::async,
+                [&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
+                    const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+                    return is_http_status_ok(http_status);
+                },
+                item
+            )
+        );
    }

    // Wait for all downloads to complete
@ -807,17 +852,18 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
    return true;
 }

-bool common_download_model(
-        const common_params_model & model,
-        const std::string & bearer_token,
-        bool offline) {
+bool common_download_model(const common_params_model & model,
+                           const std::string & bearer_token,
+                           bool offline,
+                           const common_header_list & headers) {
    // Basic validation of the model.url
    if (model.url.empty()) {
        LOG_ERR("%s: invalid model url\n", __func__);
        return false;
    }

-    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
+    const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
+    if (!is_http_status_ok(http_status)) {
        return false;
    }

@ -876,27 +922,26 @@ bool common_download_model(
        }

        // Download in parallel
-        common_download_file_multiple(urls, bearer_token, offline);
+        common_download_file_multiple(urls, bearer_token, offline, headers);
    }

    return true;
 }

-common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
+common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
+                                      const std::string & bearer_token,
+                                      bool offline,
+                                      const common_header_list & custom_headers) {
+    // the returned hf_repo is without tag
+    auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);

    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;

    // headers
-    std::vector<std::string> headers;
-    headers.push_back("Accept: application/json");
+    common_header_list headers = custom_headers;
+    headers.push_back({"Accept", "application/json"});
    if (!bearer_token.empty()) {
-        headers.push_back("Authorization: Bearer " + bearer_token);
+        headers.push_back({"Authorization", "Bearer " + bearer_token});
    }
    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
    // User-Agent header is already set in common_remote_get_content, no need to set it here
@ -952,7 +997,7 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
    } else if (res_code == 401) {
        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+        throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
    }

    // check response
@ -1031,9 +1076,10 @@ std::string common_docker_resolve_model(const std::string & docker) {
        const std::string    url_prefix = "https://registry-1.docker.io/v2/" + repo;
        std::string          manifest_url = url_prefix + "/manifests/" + tag;
        common_remote_params manifest_params;
-        manifest_params.headers.push_back("Authorization: Bearer " + token);
-        manifest_params.headers.push_back(
-            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        manifest_params.headers.push_back({"Authorization", "Bearer " + token});
+        manifest_params.headers.push_back({"Accept",
+            "application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json"
+        });
        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
        if (manifest_res.first != 200) {
            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
@ -1070,7 +1116,8 @@ std::string common_docker_resolve_model(const std::string & docker) {
        std::string local_path = fs_get_cache_file(model_filename);

        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
-        if (!common_download_file_single(blob_url, local_path, token, false)) {
+        const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
+        if (!is_http_status_ok(http_status)) {
            throw std::runtime_error("Failed to download Docker Model");
        }

@ -1084,11 +1131,11 @@ std::string common_docker_resolve_model(const std::string & docker) {

 #else

-common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
+common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
    throw std::runtime_error("download functionality is not enabled in this build");
 }

-bool common_download_model(const common_params_model &, const std::string &, bool) {
+bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
    throw std::runtime_error("download functionality is not enabled in this build");
 }

@ -1096,6 +1143,14 @@ std::string common_docker_resolve_model(const std::string &) {
    throw std::runtime_error("download functionality is not enabled in this build");
 }

+int common_download_file_single(const std::string &,
+                                const std::string &,
+                                const std::string &,
+                                bool,
+                                const common_header_list &) {
+    throw std::runtime_error("download functionality is not enabled in this build");
+}
+
 #endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB

 std::vector<common_cached_model_info> common_list_cached_models() {
--- a/common/download.h
+++ b/common/download.h
@ -1,12 +1,27 @@
 #pragma once

 #include <string>
+#include <vector>

 struct common_params_model;

-//
-// download functionalities
-//
+using common_header      = std::pair<std::string, std::string>;
+using common_header_list = std::vector<common_header>;
+
+struct common_remote_params {
+    common_header_list headers;
+    long timeout  = 0;           // in seconds, 0 means no timeout
+    long max_size = 0;           // unlimited if 0
+};
+
+// get remote file content, returns <http_code, raw_response_body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
+
+// split HF repo with tag into <repo, tag>
+// for example: "user/model:tag" -> <"user/model", "tag">
+// if tag is not present, default to "latest"
+// example: "user/model" -> <"user/model", "latest">
+std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);

 struct common_cached_model_info {
    std::string manifest_path;
@ -41,17 +56,29 @@ struct common_hf_file_res {
 common_hf_file_res common_get_hf_file(
    const std::string & hf_repo_with_tag,
    const std::string & bearer_token,
-    bool offline);
+    bool offline,
+    const common_header_list & headers = {}
+);

 // returns true if download succeeded
 bool common_download_model(
    const common_params_model & model,
    const std::string & bearer_token,
-    bool offline);
+    bool offline,
+    const common_header_list & headers = {}
+);

 // returns list of cached models
 std::vector<common_cached_model_info> common_list_cached_models();

+// download single file from url to local path
+// returns status code or -1 on error
+int common_download_file_single(const std::string & url,
+                                const std::string & path,
+                                const std::string & bearer_token,
+                                bool offline,
+                                const common_header_list & headers = {});
+
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
--- a/common/preset.cpp
+++ b/common/preset.cpp
@ -16,6 +16,48 @@ static std::string rm_leading_dashes(const std::string & str) {
    return str.substr(pos);
 }

+// only allow a subset of args for remote presets for security reasons
+// do not add more args unless absolutely necessary
+// args that output to files are strictly prohibited
+static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
+    static const std::set<std::string> allowed_options = {
+        "model-url",
+        "hf-repo",
+        "hf-repo-draft",
+        "hf-repo-v", // vocoder
+        "hf-file-v", // vocoder
+        "mmproj-url",
+        "pooling",
+        "jinja",
+        "batch-size",
+        "ubatch-size",
+        "cache-reuse",
+        "chat-template-kwargs",
+        "mmap",
+        // note: sampling params are automatically allowed by default
+        // negated args will be added automatically if the positive arg is specified above
+    };
+
+    std::set<std::string> allowed_keys;
+
+    for (const auto & it : key_to_opt) {
+        const std::string & key = it.first;
+        const common_arg & opt = it.second;
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
+            allowed_keys.insert(key);
+            // also add variant keys (args without leading dashes and env vars)
+            for (const auto & arg : opt.get_args()) {
+                allowed_keys.insert(rm_leading_dashes(arg));
+            }
+            for (const auto & env : opt.get_env()) {
+                allowed_keys.insert(env);
+            }
+        }
+    }
+
+    return allowed_keys;
+}
+
 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
    std::vector<std::string> args;

@ -121,6 +163,29 @@ void common_preset::merge(const common_preset & other) {
    }
 }

+void common_preset::apply_to_params(common_params & params) const {
+    for (const auto & [opt, val] : options) {
+        // apply each option to params
+        if (opt.handler_string) {
+            opt.handler_string(params, val);
+        } else if (opt.handler_int) {
+            opt.handler_int(params, std::stoi(val));
+        } else if (opt.handler_bool) {
+            opt.handler_bool(params, common_arg_utils::is_truthy(val));
+        } else if (opt.handler_str_str) {
+            // not supported yet
+            throw std::runtime_error(string_format(
+                "%s: option with two values is not supported yet",
+                __func__
+            ));
+        } else if (opt.handler_void) {
+            opt.handler_void(params);
+        } else {
+            GGML_ABORT("unknown handler type");
+        }
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
    std::map<std::string, std::map<std::string, std::string>> parsed;

@ -230,10 +295,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
    return value;
 }

-common_preset_context::common_preset_context(llama_example ex)
+common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
        : ctx_params(common_params_parser_init(default_params, ex)) {
    common_params_add_preset_options(ctx_params.options);
    key_to_opt = get_map_key_opt(ctx_params);
+
+    // setup allowed keys if only_remote_allowed is true
+    if (only_remote_allowed) {
+        filter_allowed_keys = true;
+        allowed_keys = get_remote_preset_whitelist(key_to_opt);
+    }
 }

 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@ -249,7 +320,18 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
        }
        LOG_DBG("loading preset: %s\n", preset.name.c_str());
        for (const auto & [key, value] : section.second) {
+            if (key == "version") {
+                // skip version key (reserved for future use)
+                continue;
+            }
+
            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+            if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
+                throw std::runtime_error(string_format(
+                    "option '%s' is not allowed in remote presets",
+                    key.c_str()
+                ));
+            }
            if (key_to_opt.find(key) != key_to_opt.end()) {
                const auto & opt = key_to_opt.at(key);
                if (is_bool_arg(opt)) {
@ -259,7 +341,10 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
                }
                LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
            } else {
-                // TODO: maybe warn about unknown key?
+                throw std::runtime_error(string_format(
+                    "option '%s' not recognized in preset '%s'",
+                    key.c_str(), preset.name.c_str()
+                ));
            }
        }

--- a/common/preset.h
+++ b/common/preset.h
@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
 #include <map>
+#include <set>

 //
 // INI preset parser and writer
@ -40,6 +41,9 @@ struct common_preset {

    // merge another preset into this one, overwriting existing options
    void merge(const common_preset & other);
+
+    // apply preset options to common_params
+    void apply_to_params(common_params & params) const;
 };

 // interface for multiple presets in one file
@ -50,7 +54,12 @@ struct common_preset_context {
    common_params default_params; // unused for now
    common_params_context ctx_params;
    std::map<std::string, common_arg> key_to_opt;
-    common_preset_context(llama_example ex);
+
+    bool filter_allowed_keys = false;
+    std::set<std::string> allowed_keys;
+
+    // if only_remote_allowed is true, only accept whitelisted keys
+    common_preset_context(llama_example ex, bool only_remote_allowed = false);

    // load presets from INI file
    common_presets load_from_ini(const std::string & path, common_preset & global) const;
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -528,7 +528,11 @@ class ModelBase:
        return ()

    def prepare_tensors(self):
-        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+        # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
+        if self.tensor_map.mapping:
+            max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+        else:
+            max_name_len = len("vision_encoder.weight,")  # Default reasonable length

        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
            # we don't need these
@ -771,8 +775,8 @@ class TextModel(ModelBase):

        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}

-        rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
-        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
+        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)

        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
@ -4363,7 +4367,37 @@ class Qwen3NextModel(Qwen2MoeModel):
        elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
            data_torch = data_torch + 1

-        yield from super().modify_tensors(data_torch, name, bid)
+        if "in_proj_qkvz.weight" in name:
+            # original order:  [q, k, v, z] * head_count
+            # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count]
+            head_k_dim = self.hparams["linear_key_head_dim"]
+            head_v_dim = self.hparams["linear_value_head_dim"]
+            num_v_heads = self.hparams["linear_num_value_heads"]
+            num_k_heads = self.hparams["linear_num_key_heads"]
+            hidden_size = self.hparams["hidden_size"]
+            split_arg_list_qkvz = [
+                head_k_dim, # q partition
+                head_k_dim, # k partition
+                (num_v_heads // num_k_heads * head_v_dim), # v partition
+                (num_v_heads // num_k_heads * head_v_dim), # z partition
+            ]
+            # view as (n_embd, head_count, [q+k+v+z])
+            data_torch = data_torch.permute(1, 0).contiguous()
+            data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz))
+            # split into q, k, v, z
+            q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1)
+            # flatten dim + head_count
+            q = q.contiguous().view(hidden_size, -1)
+            k = k.contiguous().view(hidden_size, -1)
+            v = v.contiguous().view(hidden_size, -1)
+            z = z.contiguous().view(hidden_size, -1)
+            # stack back
+            qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous()
+            z = z.permute(1, 0).contiguous()
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV,  bid, ".weight"), qkv)
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z)
+        else:
+            yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("RND1")
@ -6038,7 +6072,175 @@ class Gemma3VisionModel(MmprojModel):
        return [] # skip other tensors


+class ConformerAudioModel(MmprojModel):
+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def is_audio_tensor(name: str):
+        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ConformerAudioModel.is_audio_tensor(name):
+            if ".conv" in name or "_conv" in name and ".weight" in name:
+                return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # fold running_mean, running_var and eps into weight and bias for batch_norm
+        if "batch_norm" in name:
+            if self._batch_norm_tensors is None:
+                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+            assert bid is not None
+            self._batch_norm_tensors[bid][name] = data_torch
+
+            if len(self._batch_norm_tensors[bid]) < 5:
+                return []
+
+            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
+            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
+            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
+            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
+            eps = 1e-5 # default value
+
+            a = weight / torch.sqrt(running_var + eps)
+            b = bias - running_mean * a
+            return [
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
+            ]
+
+        # reshape conv weights
+        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+            data_torch = data_torch[:, None, None]
+        if "conv.depthwise_conv" in name and name.endswith(".weight"):
+            assert data_torch.shape[1] == 1
+            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
+        if "conv.pointwise_conv" in name and name.endswith(".weight"):
+            assert data_torch.shape[2] == 1
+            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
@ModelBase.register("Gemma3nForConditionalGeneration")
+class Gemma3nVisionAudioModel(ConformerAudioModel):
+    has_audio_encoder = True
+    has_vision_encoder = True
+
+    # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
+    # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py
+    block_tensor_mapping = {
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight":             "v.blk.{bid}.{sid}.conv_exp.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight":                  "v.blk.{bid}.{sid}.bn1.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight":             "v.blk.{bid}.{sid}.conv_pwl.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight":                  "v.blk.{bid}.{sid}.bn2.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight":        "v.blk.{bid}.{sid}.dw_start.conv.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight":          "v.blk.{bid}.{sid}.dw_start.bn.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight":          "v.blk.{bid}.{sid}.dw_mid.conv.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight":            "v.blk.{bid}.{sid}.dw_mid.bn.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight":          "v.blk.{bid}.{sid}.pw_exp.conv.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight":            "v.blk.{bid}.{sid}.pw_exp.bn.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight":         "v.blk.{bid}.{sid}.pw_proj.conv.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight":           "v.blk.{bid}.{sid}.pw_proj.bn.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma":           "v.blk.{bid}.{sid}.layer_scale.gamma",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight":      "v.blk.{bid}.{sid}.attn.query.proj.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight":        "v.blk.{bid}.{sid}.attn.key.proj.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight":      "v.blk.{bid}.{sid}.attn.value.proj.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight":     "v.blk.{bid}.{sid}.attn.output.proj.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight":   "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight":        "v.blk.{bid}.{sid}.attn.key.norm.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight":      "v.blk.{bid}.{sid}.attn.value.norm.weight",
+        "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight":                 "v.blk.{bid}.{sid}.norm.weight",
+    }
+
+    def __init__(self, *args, **kwargs):
+        # Parent init will call find_hparam which now returns 0 for empty keys
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
+        self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
+
+        # MobileNetV5 does not use image_mean/std
+        self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0]
+        self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0]
+        self.hparams_vision["image_size"] = self.preprocessor_config.get(
+            "size", {"height": 768, "width": 768}
+        )["height"]
+
+        # Image sequence length (256 tokens = 16x16 for Gemma3n)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        image_size = self.hparams_vision["image_size"]
+        self.hparams_vision["patch_size"] = image_size // image_seq_length
+
+        # remap audio hparams
+        assert self.hparams_audio is not None
+        self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
+        self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # vision params
+        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+
+        # audio params
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # Force quantization settings for specific tensor types
+        if "input_projection" in name or "input_proj" in name:
+            return gguf.GGMLQuantizationType.F16
+        if ".embeddings." in name or "stem" in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def custom_map(self, name: str) -> str:
+        """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
+        parts = name.split(".")
+        # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
+        if len(parts) >= 7:
+            bid, sid = parts[4], parts[5]
+            suffix = ".".join(parts[6:])
+            template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
+            if template in self.block_tensor_mapping:
+                return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
+
+        raise ValueError(f"Unknown name: {name}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if (ConformerAudioModel.is_audio_tensor(name)):
+            name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
+            return super().modify_tensors(data_torch, name, bid)
+
+        # Gemma3n uses
+        # - model.embed_vision.* for projection layers
+        # - model.vision_tower.* for vision encoder
+        # Skip non-vision tensors
+        if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
+            return []
+
+        if name.startswith("model.vision_tower.timm_model.blocks."):
+            # Double-indexed block tensors through custom logic
+            new_name = self.custom_map(name)
+        else:
+            # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py
+            new_name = self.map_tensor_name(name)
+
+        if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
+            data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1]
+
+        return [(new_name, data_torch)]
+
+
+@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
 class Gemma3NModel(Gemma3Model):
    model_arch = gguf.MODEL_ARCH.GEMMA3N
    norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
@ -6061,8 +6263,25 @@ class Gemma3NModel(Gemma3Model):
        ]

    def set_vocab(self):
+        # For Gemma3n multimodal models, we need the FULL vocab_size (262400)
+        # which includes special tokens from 262144-262399 for vision/audio.
+        # The vocab_size_per_layer_input (262144) is only the embedding size per layer.
+        # Temporarily override the hparams lookup order to prioritize vocab_size.
+
+        # Store original vocab_size_per_layer_input if it exists
+        vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
+
+        # Temporarily remove vocab_size_per_layer_input to force using vocab_size
+        if vocab_size_per_layer_input is not None:
+            del self.hparams["vocab_size_per_layer_input"]
+
+        # Call parent set_vocab which will now use vocab_size (262400)
        super().set_vocab()

+        # Restore vocab_size_per_layer_input for later use
+        if vocab_size_per_layer_input is not None:
+            self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
@ -6098,8 +6317,32 @@ class Gemma3NModel(Gemma3Model):
        if "language_model." not in name:
            return [] # skip non-language model tensors

+        # Pad token embeddings for vision/audio special tokens (262144-262399)
+        if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
+            # Move to CPU to avoid meta device issues during padding
+            data_torch = data_torch.to(device="cpu")
+
+            vocab_size = self.hparams.get("vocab_size", 262400)
+            current_size = data_torch.shape[0]  # First dimension is vocab_size
+
+            if current_size < vocab_size:
+                # Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
+                padding_size = vocab_size - current_size
+                tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
+                logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
+
+                # Create padding with zeros (vision tokens won't use these embeddings)
+                padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
+                data_torch = torch.cat([data_torch, padding], dim=0)
+
+            # Continue with normal processing
+            name = name.replace("language_model.", "")
+            return [(self.map_tensor_name(name), data_torch)]
+
        if "altup_unembed_projections" in name:
            data_torch = data_torch.to(device="cpu")
+            # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
+            # They should NOT be padded
            if ".0." in name:
                self._altup_unembd[0] = data_torch
            elif ".1." in name:
@ -9936,7 +10179,7 @@ class LFM2Model(TextModel):
        self._add_feed_forward_length()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if self._is_vision_tensor(name) or self._is_audio_tensor(name):
+        if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
            # skip multimodal tensors
            return []

@ -9952,9 +10195,6 @@ class LFM2Model(TextModel):
    def _is_vision_tensor(self, name: str) -> bool:
        return "vision_tower" in name or "multi_modal_projector" in name

-    def _is_audio_tensor(self, name: str):
-        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
-

@ModelBase.register("Lfm2Model")
 class LFM2ColBertModel(LFM2Model):
@ -10082,13 +10322,11 @@ class LFM2VLModel(MmprojModel):


@ModelBase.register("Lfm2AudioForConditionalGeneration")
-class LFM2AudioModel(MmprojModel):
+class LFM2AudioModel(ConformerAudioModel):
    has_vision_encoder = False
    has_audio_encoder = True
    model_name = "Lfm2AudioEncoder"

-    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
-
    def get_audio_config(self) -> dict[str, Any] | None:
        return self.global_config.get("encoder")

@ -10102,12 +10340,7 @@ class LFM2AudioModel(MmprojModel):
        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)

-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if ".conv" in name and ".weight" in name:
-            return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+    def modify_tensors(self, data_torch, name, bid):
        # skip language model tensors
        if name.startswith("lfm."):
            return []
@ -10120,40 +10353,7 @@ class LFM2AudioModel(MmprojModel):
        if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
            return []

-        # fold running_mean, running_var and eps into weight and bias for batch_norm
-        if "batch_norm" in name:
-            if self._batch_norm_tensors is None:
-                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
-            assert bid is not None
-            self._batch_norm_tensors[bid][name] = data_torch
-
-            if len(self._batch_norm_tensors[bid]) < 5:
-                return []
-
-            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
-            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
-            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
-            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
-            eps = 1e-5 # default value
-
-            a = weight / torch.sqrt(running_var + eps)
-            b = bias - running_mean * a
-            return [
-                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
-                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
-            ]
-
-        # reshape conv weights
-        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
-            data_torch = data_torch[:, None, None]
-        if "conv.depthwise_conv" in name and name.endswith(".weight"):
-            assert data_torch.shape[1] == 1
-            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
-        if "conv.pointwise_conv" in name and name.endswith(".weight"):
-            assert data_torch.shape[2] == 1
-            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
-
-        return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("SmallThinkerForCausalLM")
@ -10974,8 +11174,8 @@ def parse_args() -> argparse.Namespace:

    parser.add_argument(
        "--sentence-transformers-dense-modules", action="store_true",
-        help=("Whether to include sentence-transformers dense modules."
-              "It can be used for sentence-transformers models, like google/embeddinggemma-300m"
+        help=("Whether to include sentence-transformers dense modules. "
+              "It can be used for sentence-transformers models, like google/embeddinggemma-300m. "
              "Default these modules are not included.")
    )

--- a/docs/ops.md
+++ b/docs/ops.md
@ -57,7 +57,6 @@ Legend:
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@ -71,10 +70,9 @@ Legend:
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
+|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
 |                              PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@ -99,7 +97,6 @@ Legend:
 |                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
--- a/docs/ops/BLAS.csv
+++ b/docs/ops/BLAS.csv
@ -965,6 +965,7 @@
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
+"BLAS","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
@ -4964,6 +4965,7 @@
 "BLAS","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","0","no","BLAS"
 "BLAS","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","BLAS"
 "BLAS","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","BLAS"
+"BLAS","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","BLAS"
 "BLAS","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","BLAS"
 "BLAS","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","BLAS"
 "BLAS","ARGMAX","type=f32,ne=[32,1,1,1]","support","0","no","BLAS"
@ -5715,15 +5717,15 @@
 "BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
 "BLAS","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","0","no","BLAS"
 "BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
@ -5733,6 +5735,15 @@
 "BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
@ -6592,6 +6603,30 @@
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=67,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f32,type_b=f32,m=64,n=77,k=77,bs=[12,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
 "BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=576,n=512,k=576,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
+"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=f32,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q4_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q5_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q5_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q8_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=mxfp4,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q2_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q3_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q4_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q5_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q6_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq2_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq2_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq3_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq1_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq1_m,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq4_nl,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq3_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq4_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=128,n=1,k=1056,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=2112,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
@ -8916,6 +8951,11 @@
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200000,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200000,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[643251,3,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
@ -8968,6 +9008,7 @@
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@ -8977,6 +9018,7 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@ -8987,11 +9029,13 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@ -9001,6 +9045,7 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@ -9011,11 +9056,13 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@ -9025,6 +9072,7 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@ -9035,11 +9083,13 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@ -9049,6 +9099,7 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@ -9059,6 +9110,7 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@ -9184,6 +9236,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@ -9193,6 +9246,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@ -9203,11 +9257,13 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@ -9217,6 +9273,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@ -9227,11 +9284,13 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@ -9241,6 +9300,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@ -9251,11 +9311,13 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@ -9265,6 +9327,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@ -9275,6 +9338,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@ -9542,333 +9606,333 @@
 "BLAS","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","BLAS"
 "BLAS","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","BLAS"
 "BLAS","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
 "BLAS","SUM","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
 "BLAS","SUM_ROWS","type=f32,ne=[10,5,4,3],permute=0,slice=0","support","0","no","BLAS"
 "BLAS","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","BLAS"
@ -9891,8 +9955,9 @@
 "BLAS","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
 "BLAS","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
 "BLAS","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","BLAS"
 "BLAS","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","BLAS"
 "BLAS","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","BLAS"
 "BLAS","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","BLAS"
@ -9914,6 +9979,7 @@
 "BLAS","CUMSUM","type=f32,ne=[2048,5,4,3]","support","0","no","BLAS"
 "BLAS","CUMSUM","type=f32,ne=[242004,1,1,1]","support","0","no","BLAS"
 "BLAS","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","BLAS"
+"BLAS","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","BLAS"
 "BLAS","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
 "BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","BLAS"
 "BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","BLAS"
@ -9923,17 +9989,41 @@
 "BLAS","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","BLAS"
 "BLAS","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","BLAS"
 "BLAS","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","0","no","BLAS"
+"BLAS","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","BLAS"
+"BLAS","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","BLAS"
+"BLAS","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","BLAS"
+"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","BLAS"
 "BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","0","no","BLAS"
 "BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","BLAS"
 "BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","BLAS"
--- a/docs/ops/zDNN.csv
+++ b/docs/ops/zDNN.csv
--- a/docs/preset.md
+++ b/docs/preset.md
@ -0,0 +1,97 @@
+# llama.cpp INI Presets
+
+## Introduction
+
+The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/llama.cpp/pull/17859), allows users to create reusable and shareable parameter configurations for llama.cpp.
+
+### Using Presets with the Server
+
+When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
+
+### Using a Remote Preset
+
+> [!NOTE]
+>
+> This feature is currently only supported via the `-hf` option.
+
+For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.
+
+Example:
+
+```ini
+hf-repo-draft = username/my-draft-model-GGUF
+temp = 0.5
+top-k = 20
+top-p = 0.95
+```
+
+For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
+
+Example usage:
+
+Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
+
+```sh
+llama-cli -hf username/my-model-with-preset
+
+# This is equivalent to:
+llama-cli -hf username/my-model-with-preset \
+  --hf-repo-draft username/my-draft-model-GGUF \
+  --temp 0.5 \
+  --top-k 20 \
+  --top-p 0.95
+```
+
+You can also override preset arguments by specifying them on the command line:
+
+```sh
+# Force temp = 0.1, overriding the preset value
+llama-cli -hf username/my-model-with-preset --temp 0.1
+```
+
+If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
+
+```ini
+hf-repo = user/my-model-main
+hf-repo-draft = user/my-model-draft
+temp = 0.8
+ctx-size = 1024
+; (and other configurations)
+```
+
+### Named presets
+
+If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo containing a single `preset.ini` file that references the actual model(s):
+
+```ini
+[*]
+mmap = 1
+
+[gpt-oss-20b-hf]
+hf          = ggml-org/gpt-oss-20b-GGUF
+batch-size  = 2048
+ubatch-size = 2048
+top-p       = 1.0
+top-k       = 0
+min-p       = 0.01
+temp        = 1.0
+chat-template-kwargs = {"reasoning_effort": "high"}
+
+[gpt-oss-120b-hf]
+hf          = ggml-org/gpt-oss-120b-GGUF
+batch-size  = 2048
+ubatch-size = 2048
+top-p       = 1.0
+top-k       = 0
+min-p       = 0.01
+temp        = 1.0
+chat-template-kwargs = {"reasoning_effort": "high"}
+```
+
+You can then use it via `llama-cli` or `llama-server`, example:
+
+```sh
+llama-server -hf user/repo:gpt-oss-120b-hf
+```
+
+Please make sure to provide the correct `hf-repo` for each child preset. Otherwise, you may get error: `The specified tag is not a valid quantization scheme.`
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@ -553,6 +553,7 @@ int main(int argc, char ** argv) {
    model_params.n_gpu_layers       = params.n_gpu_layers;
    model_params.devices            = params.devices.data();
    model_params.use_mmap           = params.use_mmap;
+    model_params.use_direct_io      = params.use_direct_io;
    model_params.use_mlock          = params.use_mlock;
    model_params.check_tensors      = params.check_tensors;

--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@ -61,7 +61,7 @@ causal-run-converted-model:
 	@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh

 causal-verify-logits: causal-run-original-model causal-run-converted-model
-	@./scripts/causal/compare-logits.py
+	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
 	@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}

 causal-run-original-embeddings:
@ -138,16 +138,13 @@ embedding-run-original-model-st: embedding-run-original-model
 embedding-run-converted-model:
 	@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
 	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
-	$(if $(USE_POOLING),--pooling)
-
-embedding-run-converted-model-st: USE_POOLING=1
-embedding-run-converted-model-st: embedding-run-converted-model
+	$(if $(EMBD_NORMALIZE),--embd-normalize "$(EMBD_NORMALIZE)")

 embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
 	@./scripts/embedding/compare-embeddings-logits.sh \
 	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

-embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
+embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model
 	@./scripts/embedding/compare-embeddings-logits.sh \
 	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@ -198,14 +198,13 @@ model, and the other is a text file which allows for manual visual inspection.

 #### Using SentenceTransformer with numbered layers
 For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
-03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:
+03_Dense, 04_Normalize), these will be applied automatically when running the
+converted model but currently there is a separate target to run the original
+version:

 ```console
 # Run original model with SentenceTransformer (applies all numbered layers)
 (venv) $ make embedding-run-original-model-st
-
-# Run converted model with pooling enabled
-(venv) $ make embedding-run-converted-model-st
 ```

 This will use the SentenceTransformer library to load and run the model, which
@ -213,6 +212,17 @@ automatically applies all the numbered layers in the correct order. This is
 particularly useful when comparing with models that should include these
 additional transformation layers beyond just the base model output.

+The type of normalization can be specified for the converted model but is not
+strictly necessary as the verification uses cosine similarity and the magnitude
+of the output vectors does not affect this. But the normalization type can be
+specified as an argument to the target which might be useful for manual
+inspection:
+```console
+(venv) $ make embedding-verify-logits-st EMBD_NORMALIZE=1
+```
+The original model will apply the normalization according to the normalization
+layer specified in the modules.json configuration file.
+
 ### Model conversion
 After updates have been made to [gguf-py](../../gguf-py) to add support for the
 new model the model can be converted to GGUF format using the following command:
--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@ -3,10 +3,11 @@
 import sys
 import numpy as np
 from pathlib import Path
+import os

 # Add utils directory to path for direct script execution
 sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
-from common import get_model_name_from_env_path, compare_tokens  # type: ignore[import-not-found]
+from common import get_model_name_from_env_path, compare_tokens, exit_with_warning  # type: ignore[import-not-found]

 def quick_logits_check(pytorch_file, llamacpp_file):
    """Lightweight sanity check before NMSE"""
@ -38,6 +39,7 @@ def quick_logits_check(pytorch_file, llamacpp_file):
    return True

 def main():
+    model_path = os.environ.get('MODEL_PATH')
    model_name = get_model_name_from_env_path('MODEL_PATH')
    data_dir = Path("data")
    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
@ -62,8 +64,7 @@ def main():
    print("🔍 Token Comparison Check")
    print("=" * 40)
    if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"):
-        print("\n❌ Token mismatch detected")
-        sys.exit(1)
+        exit_with_warning("\n❌ Token mismatch detected", model_path)
    print()

    print("🔍 GGML Model Validation for model ", model_name)
@ -80,8 +81,7 @@ def main():
        print("       Ok to proceed with NMSE check...")
        sys.exit(0)
    else:
-        print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
-        sys.exit(1)
+        exit_with_warning(f"❌ NOK: Top 10 predictions don't match - generation will differ", model_path)

 if __name__ == "__main__":
    main()
--- a/examples/model-conversion/scripts/embedding/run-converted-model.sh
+++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh
@ -5,7 +5,7 @@ set -e
 # Parse command line arguments
 CONVERTED_MODEL=""
 PROMPTS_FILE=""
-USE_POOLING=""
+EMBD_NORMALIZE="2"

 while [[ $# -gt 0 ]]; do
    case $1 in
@ -13,9 +13,9 @@ while [[ $# -gt 0 ]]; do
            PROMPTS_FILE="$2"
            shift 2
            ;;
-        --pooling)
-            USE_POOLING="1"
-            shift
+        --embd-normalize)
+            EMBD_NORMALIZE="$2"
+            shift 2
            ;;
        *)
            if [ -z "$CONVERTED_MODEL" ]; then
@ -51,8 +51,4 @@ fi
 echo $CONVERTED_MODEL

 cmake --build ../../build --target llama-debug -j8
-if [ -n "$USE_POOLING" ]; then
-    ../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding --pooling mean -p "$PROMPT" --save-logits
-else
-    ../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding --pooling none -p "$PROMPT" --save-logits
-fi
+../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
--- a/examples/model-conversion/scripts/utils/common.py
+++ b/examples/model-conversion/scripts/utils/common.py
@ -3,6 +3,9 @@
 import os
 import sys
 import torch
+import transformers
+import json
+import textwrap
 import numpy as np
 from pathlib import Path

@ -243,3 +246,54 @@ def compare_tokens(original, converted, type_suffix="", output_dir="data"):
        print(f"  ... and {len(mismatches) - num_to_show} more mismatches")

    return False
+
+
+def show_version_warning(current_version, model_version):
+    if not model_version:
+        return False
+
+    try:
+        from packaging.version import parse, InvalidVersion
+        try:
+            return parse(current_version) < parse(model_version)
+        except InvalidVersion:
+            return current_version != model_version
+    except ImportError:
+        return current_version != model_version
+
+def get_model_transformers_version(model_path):
+    if not model_path:
+        return None
+
+    config_path = Path(model_path) / "config.json"
+    if not config_path.is_file():
+        return None
+
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("transformers_version")
+    except (IOError, json.JSONDecodeError) as e:
+        print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr)
+        return None
+
+def exit_with_warning(message, model_path):
+    print(message)
+
+    if model_path and transformers is not None:
+        model_transformers_version = get_model_transformers_version(model_path)
+        transformers_version       = transformers.__version__
+        if show_version_warning(transformers_version, model_transformers_version):
+            warning_message = f"""
+                =====================================================================
+                Verification failure might be due to a transformers version mismatch:
+
+                Current transformers version: {transformers_version}
+                Model's required version    : {model_transformers_version}
+
+                Consider installing the version specified by the model's config:
+                pip install transformers=={model_transformers_version}
+                =====================================================================
+            """
+            print(textwrap.dedent(warning_message))
+    sys.exit(1)
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@ -7,7 +7,7 @@ import importlib
 from pathlib import Path

 from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
-from common import compare_tokens  # type: ignore[import-not-found]
+from common import compare_tokens, exit_with_warning  # type: ignore[import-not-found]

 unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

@ -174,8 +174,7 @@ def main():
    print("=" * 70)
    data_dir = python_emb_path.parent
    if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
-        print("\n❌ Token mismatch detected")
-        exit(1)
+        exit_with_warning("\n❌ Token mismatch detected", args.model_path)
    print()

    # Single prompt detailed comparison
@ -237,7 +236,7 @@ def main():
    elif avg_cross_sim > 0.70:
        print("⚠️  FAIR: Models have some differences")
    else:
-        print("❌ POOR: Models are significantly different")
+        exit_with_warning("❌ POOR: Models are significantly different", args.model_path)

 if __name__ == "__main__":
    main()
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -234,6 +234,11 @@

 #if UINTPTR_MAX == 0xFFFFFFFF
    #define GGML_MEM_ALIGN 4
+#elif defined(__EMSCRIPTEN__)
+// emscripten uses max_align_t == 8, so we need GGML_MEM_ALIGN == 8 for 64-bit wasm.
+// (for 32-bit wasm, the first conditional is true and GGML_MEM_ALIGN stays 4.)
+// ref: https://github.com/ggml-org/llama.cpp/pull/18628
+    #define GGML_MEM_ALIGN 8
 #else
    #define GGML_MEM_ALIGN 16
 #endif
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@ -144,7 +144,7 @@ extern "C" {
        // device description: short informative description of the device, could be the model name
        const char * (*get_description)(ggml_backend_dev_t dev);

-        // device memory in bytes
+        // device memory in bytes: 0 bytes to indicate no memory to report
        void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);

        // device type
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@ -32,14 +32,12 @@ if (BLAS_FOUND)
                pkg_check_modules(DepBLAS openblas)
            endif()
        elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
-            add_compile_definitions(GGML_BLAS_USE_BLIS)
            pkg_check_modules(DepBLAS blis)
        elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
            pkg_check_modules(DepBLAS blas-atlas)
        elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
            pkg_check_modules(DepBLAS flexiblas_api)
        elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
-            add_compile_definitions(GGML_BLAS_USE_MKL)
            # all Intel* libraries share the same include path
            pkg_check_modules(DepBLAS mkl-sdl)
        elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
@ -74,10 +72,26 @@ if (BLAS_FOUND)

    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

-    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+    if ("${GGML_BLAS_VENDOR}" STREQUAL "")
+        message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
        add_compile_definitions(GGML_BLAS_USE_MKL)
    endif()

+    if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
+        add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
+        add_compile_definitions(GGML_BLAS_USE_BLIS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
+        add_compile_definitions(GGML_BLAS_USE_NVPL)
+    endif()
+
    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@ -115,15 +115,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
    }

-#if defined(OPENBLAS_VERSION)
+#if defined(GGML_BLAS_USE_OPENBLAS)
    openblas_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_BLIS)
+#elif defined(GGML_BLAS_USE_BLIS)
    bli_thread_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_NVPL)
+#elif defined(GGML_BLAS_USE_NVPL)
    nvpl_blas_set_num_threads(ctx->n_threads);
 #endif

@ -288,7 +284,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
        /* .context = */ ctx,
    };

-#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
    if (openblas_get_parallel() != OPENBLAS_OPENMP) {
        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
    }
@ -329,7 +325,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
        return "BLIS";
    #elif defined(GGML_BLAS_USE_NVPL)
        return "NVPL";
-    #elif defined(OPENBLAS_VERSION)
+    #elif defined(GGML_BLAS_USE_OPENBLAS)
        return "OpenBLAS";
    #else
        return "BLAS";
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -2541,27 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }

-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
 * @brief Records an event on the CANN backend stream.
 *
@ -2637,6 +2616,7 @@ struct ggml_backend_cann_device_context {
    int         device;
    std::string name;
    std::string description;
+    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@ -2713,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
    return ggml_backend_cann_host_buffer_type();
 }

+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
 * @brief Creates a new event for the CANN backend device.
 *
@ -2829,12 +2829,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
        if (!initialized) {
            aclInit(nullptr);
            ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

            for (int i = 0; i < ggml_cann_info().device_count; i++) {
                ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
                dev_ctx->description                       = aclrtGetSocName();
                dev_ctx->device                            = i;
                dev_ctx->name                              = GGML_CANN_NAME + std::to_string(i);
+                dev_ctx->op_offload_min_batch_size         = min_batch_size;
                ggml_cann_set_device(i);
                ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface   = */ ggml_backend_cann_device_interface,
                                                                  /* .reg     = */ &reg,
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@ -47,7 +47,10 @@ if (CUDAToolkit_FOUND)
                #     check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
                # However, the architectures 120a-real and 121a-real should work with basically any CMake version and
                #     until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
-                list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real 121a-real)
+                list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
+            endif()
+            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
            endif()
        endif()
    endif()
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -4122,6 +4122,7 @@ struct ggml_backend_cuda_device_context {
    std::string name;
    std::string description;
    std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@ -4676,11 +4677,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }

 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;

-    return get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
 }

 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@ -4848,6 +4847,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
        std::lock_guard<std::mutex> lock(mutex);
        if (!initialized) {
            ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

            for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@ -4861,6 +4861,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                char pci_bus_id[16] = {};
                snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                dev_ctx->pci_bus_id = pci_bus_id;
+                dev_ctx->op_offload_min_batch_size = min_batch_size;

                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@ -190,7 +190,7 @@ void ggml_cuda_mul_mat_q(
    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
-        const int64_t s13 = src1->nb[2] / ts_src1;
+        const int64_t s13 = src1->nb[3] / ts_src1;

        if (use_native_mxfp4) {
            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@ -333,28 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
    }

    if (amd_wmma_available(cc)) {
-        // RDNA 4 is consistently worse on rocblas
-        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
-            // High expert counts almost always better on MMQ
-            // due to a large amount of graph splits
+            // High expert counts are almost always better on MMQ due to
+            //     the synchronization overhead in the cuBLAS/hipBLAS path:
            // https://github.com/ggml-org/llama.cpp/pull/18202
            if (n_experts >= 64) {
                return true;
            }

+            // For some quantization types MMQ can have lower peak TOPS than hipBLAS
+            //     so it's only faster for sufficiently small batch sizes:
            switch (type) {
-                // These quants are really bad on MMQ
                case GGML_TYPE_Q2_K:
+                    return ne11 <= 128;
                case GGML_TYPE_Q6_K:
-                // These quants are usually worse but not always
+                    return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ2_S:
-                    return ne11 <= 128;
+                    return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
                default:
                    return true;
            }
        }
+
+        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        return true;
    }

--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@ -219,6 +219,8 @@ struct ggml_metal_device_props {
    bool use_shared_buffers;

    bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
 };

 ggml_metal_device_t ggml_metal_device_init(void);
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {

            dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];

+            dev->props.op_offload_min_batch_size  = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
            dev->props.max_buffer_size            = dev->mtl_device.maxBufferLength;
            dev->props.max_working_set_size       = dev->mtl_device.recommendedMaxWorkingSetSize;
            dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }

 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

    return (op->op == GGML_OP_MUL_MAT ||
            op->op == GGML_OP_MUL_MAT_ID) &&
-            get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+            get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }

 static ggml_backend_device_i ggml_backend_metal_device_i = {
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@ -9148,6 +9148,7 @@ typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t;
 template [[host_name("kernel_mul_mm_id_map0_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>;
+template [[host_name("kernel_mul_mm_id_map0_ne20_5" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<5>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@ -57,6 +57,7 @@ set(GGML_OPENCL_KERNELS
    add
    add_id
    argsort
+    fill
    clamp
    cpy
    cvt
@ -120,6 +121,8 @@ set(GGML_OPENCL_KERNELS
    tsembd
    upscale
    tanh
+    expm1
+    softplus
    pad
    repeat
    mul_mat_f16_f32
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@ -489,6 +489,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
    cl_kernel kernel_relu;
    cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
+    cl_kernel kernel_fill;
    cl_kernel kernel_clamp;
    cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
              kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
@ -537,6 +538,10 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_pad;
    cl_kernel kernel_tanh_f32_nd;
    cl_kernel kernel_tanh_f16_nd;
+    cl_kernel kernel_expm1_f32_nd;
+    cl_kernel kernel_expm1_f16_nd;
+    cl_kernel kernel_softplus_f32_nd;
+    cl_kernel kernel_softplus_f16_nd;
    cl_kernel kernel_upscale;
    cl_kernel kernel_upscale_bilinear;
    cl_kernel kernel_concat_f32_contiguous;
@ -787,6 +792,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+    // fill
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "fill.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("fill.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_fill = clCreateKernel(prog, "kernel_fill_f32", &err), err));
+        GGML_LOG_CONT(".");
+
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
    // clamp
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@ -1780,6 +1803,56 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        }
    }

+    // expm1
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "expm1.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("expm1.cl");
+#endif
+        cl_program prog;
+        if (!kernel_src.empty()) {
+            prog =
+                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+            CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
+            CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
+            GGML_LOG_CONT(".");
+        } else {
+            GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
+            prog = nullptr;
+            backend_ctx->kernel_expm1_f32_nd = nullptr;
+            backend_ctx->kernel_expm1_f16_nd = nullptr;
+        }
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
+    // softplus
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "softplus.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("softplus.cl");
+#endif
+        cl_program prog;
+        if (!kernel_src.empty()) {
+            prog =
+                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+            CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
+            CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
+            GGML_LOG_CONT(".");
+        } else {
+            GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
+            prog = nullptr;
+            backend_ctx->kernel_softplus_f32_nd = nullptr;
+            backend_ctx->kernel_softplus_f16_nd = nullptr;
+        }
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
    // upscale
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@ -3089,6 +3162,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                case GGML_UNARY_OP_TANH:
                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                case GGML_UNARY_OP_EXPM1:
+                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                case GGML_UNARY_OP_SOFTPLUS:
+                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
                default:
                    return false;
            }
@ -3104,6 +3183,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                default:
                    return false;
            }
+        case GGML_OP_FILL:
+            return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
        case GGML_OP_CLAMP:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_SOFT_MAX:
@ -4266,8 +4347,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
 }

 static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    *free = 1;
-    *total = 1;
+    *free = 0;
+    *total = 0;

    GGML_UNUSED(dev);
 }
@ -5860,6 +5941,36 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }

+static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src0);
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    float v = 0.0f;
+    memcpy(&v, ((int32_t *) dst->op_params), sizeof(float));
+
+    const int64_t n = ggml_nelements(dst);
+
+    cl_kernel kernel = backend_ctx->kernel_fill;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(float),    &v));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(float),    &n));
+
+    size_t local_work_size[1] = { 256 };
+    size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
+}
+
 static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
@ -6413,6 +6524,210 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }

+static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
+    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_expm1_f32_nd;
+    } else if (dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_expm1_f16_nd;
+    } else {
+        GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
+    }
+    GGML_ASSERT(kernel != nullptr);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = dst->ne[0];
+    const int ne11 = dst->ne[1];
+    const int ne12 = dst->ne[2];
+    const int ne13 = dst->ne[3];
+
+    const cl_ulong nb10 = dst->nb[0];
+    const cl_ulong nb11 = dst->nb[1];
+    const cl_ulong nb12 = dst->nb[2];
+    const cl_ulong nb13 = dst->nb[3];
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
+
+    size_t global_work_size[3];
+    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
+        return;
+    }
+    global_work_size[0] = (size_t)ne10;
+    global_work_size[1] = (size_t)ne11;
+    global_work_size[2] = (size_t)ne12;
+
+    size_t lws0 = 16, lws1 = 4, lws2 = 1;
+    if (ne10 < 16) lws0 = ne10;
+    if (ne11 < 4) lws1 = ne11;
+    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
+
+    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+
+
+    size_t local_work_size[] = {lws0, lws1, lws2};
+
+    size_t* local_work_size_ptr = local_work_size;
+    if (!backend_ctx->non_uniform_workgroups) {
+        if (global_work_size[0] % local_work_size[0] != 0 ||
+            global_work_size[1] % local_work_size[1] != 0 ||
+            global_work_size[2] % local_work_size[2] != 0) {
+            local_work_size_ptr = NULL;
+        }
+    }
+    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
+
+static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
+    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_softplus_f32_nd;
+    } else if (dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_softplus_f16_nd;
+    } else {
+        GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
+    }
+    GGML_ASSERT(kernel != nullptr);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = dst->ne[0];
+    const int ne11 = dst->ne[1];
+    const int ne12 = dst->ne[2];
+    const int ne13 = dst->ne[3];
+
+    const cl_ulong nb10 = dst->nb[0];
+    const cl_ulong nb11 = dst->nb[1];
+    const cl_ulong nb12 = dst->nb[2];
+    const cl_ulong nb13 = dst->nb[3];
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
+
+    size_t global_work_size[3];
+    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
+        return;
+    }
+    global_work_size[0] = (size_t)ne10;
+    global_work_size[1] = (size_t)ne11;
+    global_work_size[2] = (size_t)ne12;
+
+    size_t lws0 = 16, lws1 = 4, lws2 = 1;
+    if (ne10 < 16) lws0 = ne10;
+    if (ne11 < 4) lws1 = ne11;
+    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
+
+    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+
+
+    size_t local_work_size[] = {lws0, lws1, lws2};
+
+    size_t* local_work_size_ptr = local_work_size;
+    if (!backend_ctx->non_uniform_workgroups) {
+        if (global_work_size[0] % local_work_size[0] != 0 ||
+            global_work_size[1] % local_work_size[1] != 0 ||
+            global_work_size[2] % local_work_size[2] != 0) {
+            local_work_size_ptr = NULL;
+        }
+    }
+    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
+
 static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
@ -9586,6 +9901,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                    }
                    func = ggml_cl_tanh;
                    break;
+                case GGML_UNARY_OP_EXPM1:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_expm1;
+                    break;
+                case GGML_UNARY_OP_SOFTPLUS:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_softplus;
+                    break;
                default:
                    return false;
            } break;
@ -9595,6 +9922,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            }
            func = ggml_cl_glu;
            break;
+        case GGML_OP_FILL:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_fill;
+            break;
        case GGML_OP_CLAMP:
            if (!any_on_device) {
                return false;
--- a/ggml/src/ggml-opencl/kernels/expm1.cl
+++ b/ggml/src/ggml-opencl/kernels/expm1.cl
@ -0,0 +1,82 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// expm1
+//------------------------------------------------------------------------------
+kernel void kernel_expm1_f32_nd(
+        global void * p_src0_base,
+        ulong off_src0_abs,
+        global void * p_dst_base,
+        ulong off_dst_abs,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13
+) {
+    int i0 = get_global_id(0);
+    int i1 = get_global_id(1);
+    int i2 = get_global_id(2);
+
+    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
+        for (int i3 = 0; i3 < ne13; ++i3) {
+            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
+            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+
+            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
+            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+
+            *dst_val_ptr = exp(*src_val_ptr) - 1;
+        }
+    }
+}
+
+kernel void kernel_expm1_f16_nd(
+        global void * p_src0_base,
+        ulong off_src0_abs,
+        global void * p_dst_base,
+        ulong off_dst_abs,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13
+) {
+    int i0 = get_global_id(0);
+    int i1 = get_global_id(1);
+    int i2 = get_global_id(2);
+
+    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
+        for (int i3 = 0; i3 < ne13; ++i3) {
+            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
+            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+
+            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
+            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+
+            *dst_val_ptr = exp(*src_val_ptr) - 1;
+        }
+    }
+}
--- a/ggml/src/ggml-opencl/kernels/fill.cl
+++ b/ggml/src/ggml-opencl/kernels/fill.cl
@ -0,0 +1,17 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// fill
+//------------------------------------------------------------------------------
+__kernel void kernel_fill_f32(
+        __global float *dst,
+        ulong offsetd,
+        float v,
+        int n
+
+) {
+    dst = (global float*)((global char*)dst + offsetd);
+    if(get_global_id(0) < n){
+        dst[get_global_id(0)] = v;
+    }
+}
--- a/ggml/src/ggml-opencl/kernels/softplus.cl
+++ b/ggml/src/ggml-opencl/kernels/softplus.cl
@ -0,0 +1,88 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// softplus
+//------------------------------------------------------------------------------
+inline float softplus_f32(float x){
+    float ax = fabs(x);
+    float m = fmax(x, 0.0f);
+    return log1p(exp(-ax)) + m;
+}
+
+kernel void kernel_softplus_f32_nd(
+        global void * p_src0_base,
+        ulong off_src0_abs,
+        global void * p_dst_base,
+        ulong off_dst_abs,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13
+) {
+    int i0 = get_global_id(0);
+    int i1 = get_global_id(1);
+    int i2 = get_global_id(2);
+
+    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
+        for (int i3 = 0; i3 < ne13; ++i3) {
+            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
+            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+
+            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
+            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+
+            *dst_val_ptr = softplus_f32(*src_val_ptr);
+        }
+    }
+}
+
+kernel void kernel_softplus_f16_nd(
+        global void * p_src0_base,
+        ulong off_src0_abs,
+        global void * p_dst_base,
+        ulong off_dst_abs,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13
+) {
+    int i0 = get_global_id(0);
+    int i1 = get_global_id(1);
+    int i2 = get_global_id(2);
+
+    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
+        for (int i3 = 0; i3 < ne13; ++i3) {
+            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
+            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+
+            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
+            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+
+            *dst_val_ptr = (half)(softplus_f32((float)(*src_val_ptr)));
+        }
+    }
+}
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
    int device;
    std::string name;
    std::string description;
+    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }

 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return get_op_batch_size(op) >= min_batch_size;
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
 }

 static ggml_backend_event_t
@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
        std::lock_guard<std::mutex> lock(mutex);
        if (!initialized) {
            ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

            for (int i = 0; i < ggml_sycl_info().device_count; i++) {
                ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                    prop, dpct::dev_mgr::instance().get_device(i))));

                dev_ctx->description = prop.get_name();
+                dev_ctx->op_offload_min_batch_size = min_batch_size;

                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface       = */ ggml_backend_sycl_device_interface,
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -570,6 +570,7 @@ struct vk_device_struct {
    bool uma;
    bool prefer_host_memory;
    bool float_controls_rte_fp16;
+    bool subgroup_basic;
    bool subgroup_arithmetic;
    bool subgroup_shuffle;
    bool subgroup_ballot;
@ -1504,6 +1505,11 @@ template <> void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) {
    init_fastdiv_values(p.ne01,        p.ne0_1mp,  p.ne0_1L);
 }

+struct vk_quantize_q8_1_push_constants {
+    uint32_t ne;
+    uint32_t num_blocks;
+};
+
 // Allow pre-recording command buffers
 struct vk_staging_memcpy {
    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@ -2996,6 +3002,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
        if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
            m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
            m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
+        } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
+            // Xe2/Xe3 with coopmat enabled - warptile performance tuning
+            l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
+            l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
        }

        l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
@ -3336,12 +3346,12 @@ static void ggml_vk_load_shaders(vk_device& device) {

        GGML_ASSERT(device->subgroup_ballot);

-        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
-        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
-        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
 #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
        if (device->coopmat_bf16_support) {
-            CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
+            CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
        }
 #endif

@ -3449,9 +3459,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #endif

        if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
-            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
-            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
-            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
+            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
+            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
+            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
            CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);

            CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
@ -3493,9 +3503,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
            }
 #endif
        } else {
-            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
-            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
-            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
+            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
+            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
+            CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
            CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);

            CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
@ -3610,9 +3620,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #endif

        if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
-            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
-            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
-            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
+            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
+            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
+            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
            CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);

            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
@ -3636,9 +3646,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc,  matmul_id_subgroup_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
            CREATE_MM(GGML_TYPE_MXFP4,   pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc,   matmul_id_subgroup_mxfp4_f32,   , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
        } else {
-            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
-            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
-            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
+            CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
+            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
+            CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
            CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);

            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
@ -3678,6 +3688,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
        m_wg_denoms = { 64,  64, 1 };
        s_wg_denoms = { 32,  32, 1 };

+        if (device->vendor_id == VK_VENDOR_ID_INTEL && device->architecture == INTEL_XE2) {
+            // Xe2/Xe3 - bf16 warptile performance tuning
+            l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, 4, 4, 1, subgroup_size_8 };
+        }
+
        CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
        CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
    }
@ -3831,22 +3846,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
            const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
            const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);

-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);

-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);

-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);

-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
        }
 #endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
    }
@ -3934,9 +3949,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);

    if (device->subgroup_clustered && device->subgroup_require_full_support) {
-        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
    } else {
-        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
    }

    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
@ -4144,9 +4159,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

 #define CREATE_GLU(name)  \
    if (device->float_controls_rte_fp16) {  \
@ -4292,8 +4307,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);

    if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true);
    } else {
        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
@ -4629,6 +4644,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
        }
        device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;

+        device->subgroup_basic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
+                                 (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBasic);
        device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
                                      (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
 #ifdef __APPLE__
@ -5061,11 +5078,23 @@ static vk_device ggml_vk_get_device(size_t idx) {
            switch (device->vendor_id) {
 #ifndef GGML_VULKAN_RUN_TESTS
            case VK_VENDOR_ID_AMD:
+                device->mul_mat_l[i]    = false;
+                device->mul_mat_m[i]    = true;
+                device->mul_mat_s[i]    = true;
+                device->mul_mat_id_l[i] = false;
+                device->mul_mat_id_m[i] = true;
+                device->mul_mat_id_s[i] = true;
+                break;
            case VK_VENDOR_ID_INTEL:
-                device->mul_mat_l[i] = false;
+                if (!device->coopmat_support || device->architecture != INTEL_XE2) {
+                    device->mul_mat_l[i] = false;
+                    device->mul_mat_id_l[i] = false;
+                } else {
+                    device->mul_mat_l[i] = true;  // if coopmat & XE2+, allow large matmul warptile config for Intel
+                    device->mul_mat_id_l[i] = true;
+                }
                device->mul_mat_m[i] = true;
                device->mul_mat_s[i] = true;
-                device->mul_mat_id_l[i] = false;
                device->mul_mat_id_m[i] = true;
                device->mul_mat_id_s[i] = true;
                break;
@ -6076,6 +6105,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
    GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
    GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
    GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size());
+    GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants));

    vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
@ -6858,7 +6888,12 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
    const uint64_t max_elements = std::min<uint64_t>(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits<uint32_t>::max());
    const uint32_t elements = std::min(ne, static_cast<uint32_t>(max_elements));

-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 2>{ ne, num_blocks }, { elements, 1, 1 });
+    const vk_quantize_q8_1_push_constants pc = {
+        ne,
+        num_blocks,
+    };
+
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, { elements, 1, 1 });
    ggml_vk_sync_buffers(ctx, subctx);
 }

@ -9849,8 +9884,9 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,

    std::array<uint32_t, 3> elements;

-    const int splitH = 16;
-    const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, splitH);
+    const uint32_t d_state = src0->ne[0];
+    uint32_t num_subgroups = d_state / ctx->device->subgroup_size;
+    const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups);
    const uint32_t num_workgroups_y = n_seq;
    elements = { num_workgroups_x, num_workgroups_y, 1 };

@ -14228,6 +14264,7 @@ struct ggml_backend_vk_device_context {
    std::string description;
    bool is_integrated_gpu;
    std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@ -14284,6 +14321,19 @@ static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const
 }

 static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    const vk_device& device = ggml_vk_get_device(ctx->device);
+
+    // reject any tensors larger than the max buffer size
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && ggml_nbytes(op->src[i]) > device->max_buffer_size) {
+            return false;
+        }
+    }
+    if (ggml_nbytes(op) > device->max_buffer_size) {
+        return false;
+    }
+
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
@ -14332,8 +14382,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_MUL_MAT_ID:
            {
                ggml_type src0_type = op->src[0]->type;
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                const vk_device& device = ggml_vk_get_device(ctx->device);
                if (op->op == GGML_OP_MUL_MAT_ID) {
                    if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
                        // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
@ -14394,8 +14442,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            }
        case GGML_OP_FLASH_ATTN_EXT:
            {
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                bool coopmat2 = device->coopmat2;
                uint32_t HSK = op->src[1]->ne[0];
                uint32_t HSV = op->src[2]->ne[0];
@ -14617,8 +14663,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
                    return false;
                }
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                // pipeline_argsort_large_f32 requires vulkan memory model.
                if (device->vulkan_memory_model) {
                    return true;
@ -14631,8 +14675,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
                    return false;
                }
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                // We could potentially support larger, using argsort to sort the
                // whole thing. Not clear if this is needed.
                uint32_t min_pipeline = (uint32_t)log2f(float(op->ne[0])) + 1;
@ -14679,8 +14721,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_CUMSUM:
            {
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
                    return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]);
                }
@ -14688,9 +14728,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            }
        case GGML_OP_SOLVE_TRI:
            {
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                const vk_device& device = ggml_vk_get_device(ctx->device);
-
                if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32) {
                    return false;
                }
@ -14755,14 +14792,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    return false;
                }

-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                const vk_device& device = ggml_vk_get_device(ctx->device);
+                size_t shmem_size = d_state * sizeof(float);

-                const uint32_t SPLIT_H = 16;
+                if (shmem_size > device->properties.limits.maxComputeSharedMemorySize) {
+                    return false;
+                }

-                size_t stateC_size = SPLIT_H * d_state * sizeof(float);
-
-                if (stateC_size > device->properties.limits.maxComputeSharedMemorySize) {
+                if (!device->subgroup_basic) {
                    return false;
                }

@ -14802,12 +14838,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
 }

 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;

-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 }

 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
@ -14933,6 +14967,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
        static std::mutex mutex;
        std::lock_guard<std::mutex> lock(mutex);
        if (!initialized) {
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
            for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
                ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
                char desc[256];
@ -14942,6 +14977,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                ctx->description = desc;
                ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
                ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+                ctx->op_offload_min_batch_size = min_batch_size;
                devices.push_back(new ggml_backend_device {
                    /* .iface   = */ ggml_backend_vk_device_i,
                    /* .reg     = */ reg,
--- a/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp
@ -1,6 +1,7 @@
 #version 450

 #extension GL_EXT_control_flow_attributes : require
+#extension GL_KHR_shader_subgroup_basic : enable
 #if USE_SUBGROUP_ADD
 #extension GL_KHR_shader_subgroup_arithmetic : enable
 #endif
@ -9,7 +10,8 @@

 layout(constant_id = 0) const uint D_STATE = 128;
 layout(constant_id = 1) const uint SUBGROUP_SIZE = 32;
-layout(constant_id = 2) const uint SPLIT_H = 16;
+
+const uint32_t c_factor = D_STATE / SUBGROUP_SIZE;

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

@ -41,22 +43,28 @@ float softplus(float x) {
    }
 }

-shared float stateC[SPLIT_H * D_STATE];
+#if !USE_SUBGROUP_ADD
+shared float temp[D_STATE];
+#endif

 void main() {
-    const uint tid = gl_LocalInvocationID.x;
-    const uint head_idx = (gl_WorkGroupID.x * SPLIT_H) / d_head;
-    const uint head_off = ((gl_WorkGroupID.x * SPLIT_H) % d_head) * 4;
-    const uint seq_idx = gl_WorkGroupID.y;
+    const uint subgroup = gl_SubgroupID;
+    const uint lane     = gl_SubgroupInvocationID;
+    const uint tid      = gl_SubgroupID * SUBGROUP_SIZE + lane;
+    const uint subgroup_idx = gl_WorkGroupID.x  * c_factor + subgroup;
+
+    const uint head_idx =  subgroup_idx / d_head;
+    const uint head_off = (subgroup_idx % d_head) * 4;
+    const uint seq_idx  = gl_WorkGroupID.y;

    const uint group_off = (head_idx / (n_head / n_group)) * D_STATE * 4;
    const uint s0_base_idx = (uint(ids[seq_idx]) * nb03 + head_idx * nb02 + head_off * D_STATE) / 4;
-    const uint x_base_idx = (seq_idx * nb13 + gl_WorkGroupID.x * SPLIT_H * 4) / 4;
+    const uint x_base_idx = (seq_idx * nb13 + subgroup_idx * 4) / 4;
    const uint dt_base_idx = (seq_idx * nb22 + head_idx * 4) / 4;
    const uint A_base_idx = (head_idx * nb31) / 4;
    const uint B_base_idx = (seq_idx * nb43 + group_off) / 4;
    const uint C_base_idx = (seq_idx * nb53 + group_off) / 4;
-    const uint y_base_idx = seq_idx * n_tok * n_head * d_head + gl_WorkGroupID.x * SPLIT_H;
+    const uint y_base_idx = seq_idx * n_tok * n_head * d_head + subgroup_idx;
    const uint s_base_idx = (s_off + seq_idx * nb03 + head_idx * nb02 + head_off * D_STATE) / 4;

    const uint stride_x = nb12 / 4;
@ -65,76 +73,52 @@ void main() {
    const uint stride_C = nb52 / 4;
    const uint stride_y = n_head * d_head;

-    float state[SPLIT_H];
-    [[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
-        state[j] = s0[s0_base_idx + j * D_STATE + tid];
+    float state[c_factor];
+
+    [[unroll]] for (uint j = 0; j < c_factor; j++) {
+        state[j] = s0[s0_base_idx + SUBGROUP_SIZE * j + lane];
    }

+    float a = A[A_base_idx];
+
    for (uint i = 0; i < n_tok; i++) {
-        const float dt_soft_plus = softplus(dt[dt_base_idx + i * stride_dt]);
+        float dt_soft_plus = softplus(dt[dt_base_idx + i * stride_dt]);

-        const float dA = exp(dt_soft_plus * A[A_base_idx]);
-
-        const float B_val = B[B_base_idx + i * stride_B + tid];
-        const float C_val = C[C_base_idx + i * stride_C + tid];
-
-        [[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
-            const float x_dt = x[x_base_idx + i * stride_x + j] * dt_soft_plus;
+        float state_sum = 0.0f;

+        const float dA   = exp(dt_soft_plus * a);
+        const float x_dt = x[x_base_idx + i * stride_x] * dt_soft_plus;
+        [[unroll]] for (uint j = 0; j < c_factor; j++) {
+            float B_val = B[B_base_idx + i * stride_B + SUBGROUP_SIZE * j + lane];
+            float C_val = C[C_base_idx + i * stride_C + SUBGROUP_SIZE * j + lane];
            state[j] = (state[j] * dA) + (B_val * x_dt);
-
-            stateC[j * D_STATE + tid] = state[j] * C_val;
+            state_sum += state[j] * C_val;
        }

+#if USE_SUBGROUP_ADD
+        state_sum = subgroupAdd(state_sum);
+#else
+        temp[tid] = state_sum;
        barrier();
-        [[unroll]]
-        for (uint w = D_STATE / 2; w >= SUBGROUP_SIZE; w >>= 1) {
-            [[unroll]] for (uint j = 0; j < (w * SPLIT_H + D_STATE - 1) / D_STATE; j++) {
-                const uint k = (tid % w) + (D_STATE * (tid / w)) + j * D_STATE * (D_STATE / w);
-                if (k < SPLIT_H * D_STATE && (k + w) < SPLIT_H * D_STATE) {
-                    stateC[k] += stateC[k + w];
-                }
+        [[unroll]] for (uint s = SUBGROUP_SIZE / 2; s > 0; s >>= 1) {
+            if (lane < s) {
+                temp[tid] += temp[tid + s];
            }
            barrier();
        }
-
-        [[unroll]] for (uint j = 0; j < max(1, SPLIT_H / (D_STATE / SUBGROUP_SIZE)); j++) {
-            const uint idx = (tid % SUBGROUP_SIZE) +
-                            D_STATE * (tid / SUBGROUP_SIZE) +
-                            j * D_STATE * (D_STATE / SUBGROUP_SIZE);
-            const uint max_idx = SUBGROUP_SIZE - 1 +
-                            D_STATE * ((D_STATE - 1) / SUBGROUP_SIZE) +
-                            j * D_STATE * (D_STATE / SUBGROUP_SIZE);
-
-            if (idx < SPLIT_H * D_STATE ||
-                max_idx < SPLIT_H * D_STATE) {
-                float sc;
-#if USE_SUBGROUP_ADD
-                sc = stateC[idx];
-                sc = subgroupAdd(sc);
-#else
-                [[unroll]] for (uint offset = SUBGROUP_SIZE / 2; offset > 0; offset >>= 1) {
-                    if (idx + offset < SPLIT_H * D_STATE) {
-                        stateC[idx] += stateC[idx + offset];
-                    }
-                    barrier();
-                }
-                if (tid % SUBGROUP_SIZE == 0) {
-                    sc = stateC[idx];
-                }
+        // get the value from lane 0
+        state_sum = temp[subgroup * SUBGROUP_SIZE];
+        barrier();
 #endif

-                if (tid % SUBGROUP_SIZE == 0) {
-                    const uint k = tid / SUBGROUP_SIZE + j * (D_STATE / SUBGROUP_SIZE);
-                    d[y_base_idx + i * stride_y + k] = sc;
-                }
-            }
+        if (lane == 0) {
+            d[y_base_idx + i * stride_y] = state_sum;
        }
-
-        barrier();
    }

-    [[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
-        d[s_base_idx + j * D_STATE + tid] = state[j];
+    // write back the state
+    [[unroll]]
+    for (int j = 0; j < c_factor; j++) {
+        d[s_base_idx + SUBGROUP_SIZE * j + lane] = state[j];
    }
 }
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@ -0,0 +1,169 @@
+#ifndef GGML_WEBGPU_SHADER_LIB_HPP
+#define GGML_WEBGPU_SHADER_LIB_HPP
+
+#include "ggml.h"
+#include "pre_wgsl.hpp"
+
+#include <string>
+#include <vector>
+
+#define GGML_WEBGPU_F16_SIZE_BYTES                   2
+#define GGML_WEBGPU_F32_SIZE_BYTES                   4
+#define GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES 8u
+#define GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE     128u
+// Matches GGML_PAD(..., 256) in src/llama-context.cpp for KV cache sizing.
+#define GGML_WEBGPU_KV_SEQ_PAD                       256u
+
+struct ggml_webgpu_flash_attn_shader_lib_context {
+    ggml_type kv_type;
+    uint32_t  head_dim_qk;
+    uint32_t  head_dim_v;
+    bool      kv_direct;
+    bool      has_mask;
+    bool      has_sinks;
+    bool      uses_logit_softcap;
+    uint32_t  sg_mat_m;
+    uint32_t  sg_mat_n;
+    uint32_t  sg_mat_k;
+    size_t    wg_mem_limit_bytes;
+    uint32_t  max_subgroup_size;
+};
+
+struct ggml_webgpu_flash_attn_shader_decisions {
+    uint32_t q_tile  = 0;
+    uint32_t kv_tile = 0;
+    uint32_t wg_size = 0;
+};
+
+struct ggml_webgpu_processed_shader {
+    std::string                             wgsl;
+    std::string                             variant;
+    ggml_webgpu_flash_attn_shader_decisions decisions;
+};
+
+// This is exposed because it's necessary in supports_op
+inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,
+                                                  uint32_t kv_tile,
+                                                  uint32_t head_dim_qk,
+                                                  uint32_t head_dim_v,
+                                                  bool     has_mask,
+                                                  bool     kv_direct) {
+    const uint32_t max_head_dim = std::max(head_dim_qk, head_dim_v);
+    size_t         f16_elems    = 0;
+    size_t         f32_elems    = 0;
+    f16_elems += q_tile * head_dim_qk;        // q_shmem
+    if (!kv_direct) {
+        f16_elems += kv_tile * max_head_dim;  // kv_shmem
+    }
+    f16_elems += q_tile * head_dim_v;         // o_shmem
+    if (has_mask) {
+        f16_elems += q_tile * kv_tile;        // mask_shmem
+    }
+    f16_elems += q_tile * kv_tile;            // inter_shmem
+    f32_elems += q_tile;                      // row_max_shmem
+    f32_elems += q_tile;                      // exp_sum_shmem
+    return f16_elems * GGML_WEBGPU_F16_SIZE_BYTES + f32_elems * GGML_WEBGPU_F32_SIZE_BYTES;
+}
+
+static uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_flash_attn_shader_lib_context & context) {
+    const size_t limit_bytes  = context.wg_mem_limit_bytes;
+    const size_t q_tile       = context.sg_mat_m;
+    const size_t base_q_bytes = (context.head_dim_qk + context.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
+                                2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
+    size_t bytes_per_kv = 0;
+    if (!context.kv_direct) {
+        bytes_per_kv += std::max(context.head_dim_qk, context.head_dim_v);
+    }
+    if (context.has_mask) {
+        bytes_per_kv += q_tile;
+    }
+    bytes_per_kv += q_tile;
+    bytes_per_kv *= GGML_WEBGPU_F16_SIZE_BYTES;
+    const uint32_t max_kv_tile = (limit_bytes - base_q_bytes) / bytes_per_kv;
+    return (max_kv_tile / context.sg_mat_n) * context.sg_mat_n;
+}
+
+inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_shader(
+    pre_wgsl::Preprocessor &                          preprocessor,
+    const char *                                      shader_src,
+    const ggml_webgpu_flash_attn_shader_lib_context & context) {
+    std::vector<std::string> defines;
+    std::string              variant = "flash_attn";
+
+    switch (context.kv_type) {
+        case GGML_TYPE_F32:
+            defines.push_back("KV_F32");
+            break;
+        case GGML_TYPE_F16:
+            defines.push_back("KV_F16");
+            break;
+        case GGML_TYPE_Q4_0:
+            defines.push_back("KV_Q4_0");
+            break;
+        case GGML_TYPE_Q8_0:
+            defines.push_back("KV_Q8_0");
+            break;
+        default:
+            GGML_ABORT("Unsupported KV type for flash attention shader");
+    }
+    variant += std::string("_") + ggml_type_name(context.kv_type);
+
+    if (context.has_mask) {
+        defines.push_back("MASK");
+        variant += "_mask";
+    }
+    if (context.has_sinks) {
+        defines.push_back("SINKS");
+        variant += "_sinks";
+    }
+    if (context.uses_logit_softcap) {
+        defines.push_back("LOGIT_SOFTCAP");
+        variant += "_lgsc";
+    }
+
+    if (context.kv_direct) {
+        defines.push_back("KV_DIRECT");
+        variant += "_kvdirect";
+    }
+
+    defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(context.head_dim_qk));
+    variant += std::string("_hsqk") + std::to_string(context.head_dim_qk);
+
+    defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(context.head_dim_v));
+    variant += std::string("_hsv") + std::to_string(context.head_dim_v);
+
+    // For now these are not part of the variant name
+    defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
+    defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
+    defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
+
+    // Add chosen Q/KV tile sizes
+    uint32_t q_tile  = context.sg_mat_m;
+    uint32_t kv_tile = std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
+                                context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
+    if (context.kv_direct) {
+        GGML_ASSERT(kv_tile <= GGML_WEBGPU_KV_SEQ_PAD);
+        // Avoids having to use bounds-checks and decreasing performance for direct KV loads
+        while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
+            kv_tile -= context.sg_mat_n;
+        }
+    }
+
+    defines.push_back(std::string("Q_TILE=") + std::to_string(q_tile));
+    defines.push_back(std::string("KV_TILE=") + std::to_string(kv_tile));
+
+    // workgroup size
+    uint32_t wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
+
+    defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
+
+    ggml_webgpu_processed_shader result;
+    result.wgsl              = preprocessor.preprocess(shader_src, defines);
+    result.variant           = variant;
+    result.decisions.q_tile  = q_tile;
+    result.decisions.kv_tile = kv_tile;
+    result.decisions.wg_size = wg_size;
+    return result;
+}
+
+#endif  // GGML_WEBGPU_SHADER_LIB_HPP
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@ -7,7 +7,9 @@

 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
+#include "ggml-webgpu-shader-lib.hpp"
 #include "ggml-wgsl-shaders.hpp"
+#include "pre_wgsl.hpp"

 #ifdef __EMSCRIPTEN__
 #    include <emscripten/emscripten.h>
@ -17,6 +19,7 @@

 #include <atomic>
 #include <condition_variable>
+#include <cstdint>
 #include <cstring>
 #include <iostream>
 #include <map>
@ -30,7 +33,7 @@

 #ifdef GGML_WEBGPU_DEBUG
 #    define WEBGPU_LOG_DEBUG(msg)  std::cout << msg << std::endl
-#    define WEBGPU_DEBUG_BUF_ELEMS 32
+#    define WEBGPU_DEBUG_BUF_ELEMS 512
 #else
 #    define WEBGPU_LOG_DEBUG(msg) ((void) 0)
 #endif  // GGML_WEBGPU_DEBUG
@ -251,6 +254,7 @@ struct webgpu_gpu_profile_buf_pool {
 struct webgpu_pipeline {
    wgpu::ComputePipeline pipeline;
    std::string           name;
+    void *                context = nullptr;
 };

 struct webgpu_command {
@ -263,6 +267,46 @@ struct webgpu_command {
 #endif
 };

+struct flash_attn_pipeline_key {
+    int      q_type;
+    int      kv_type;
+    int      dst_type;
+    uint32_t head_dim_qk;
+    uint32_t head_dim_v;
+    bool     kv_direct;
+    bool     has_mask;
+    bool     has_sinks;
+    bool     uses_logit_softcap;
+
+    bool operator==(const flash_attn_pipeline_key & other) const {
+        return q_type == other.q_type && kv_type == other.kv_type && dst_type == other.dst_type &&
+               head_dim_qk == other.head_dim_qk && head_dim_v == other.head_dim_v && kv_direct == other.kv_direct &&
+               has_mask == other.has_mask && has_sinks == other.has_sinks &&
+               uses_logit_softcap == other.uses_logit_softcap;
+    }
+};
+
+// Same hash combine function as in boost
+template <typename T> inline void ggml_webgpu_hash_combine(size_t & seed, const T & value) {
+    seed ^= std::hash<T>{}(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+struct flash_attn_pipeline_key_hash {
+    size_t operator()(const flash_attn_pipeline_key & key) const {
+        size_t seed = 0;
+        ggml_webgpu_hash_combine(seed, key.q_type);
+        ggml_webgpu_hash_combine(seed, key.kv_type);
+        ggml_webgpu_hash_combine(seed, key.dst_type);
+        ggml_webgpu_hash_combine(seed, key.head_dim_qk);
+        ggml_webgpu_hash_combine(seed, key.head_dim_v);
+        ggml_webgpu_hash_combine(seed, key.kv_direct);
+        ggml_webgpu_hash_combine(seed, key.has_mask);
+        ggml_webgpu_hash_combine(seed, key.has_sinks);
+        ggml_webgpu_hash_combine(seed, key.uses_logit_softcap);
+        return seed;
+    }
+};
+
 // All the base objects needed to run operations on a WebGPU device
 struct webgpu_context_struct {
    wgpu::Instance instance;
@ -271,12 +315,12 @@ struct webgpu_context_struct {
    wgpu::Queue    queue;
    wgpu::Limits   limits;

-    uint32_t subgroup_size;
+    uint32_t max_subgroup_size;

-#ifndef __EMSCRIPTEN__
-    bool                       supports_subgroup_matrix = false;
-    wgpu::SubgroupMatrixConfig subgroup_matrix_config;
-#endif
+    bool     supports_subgroup_matrix = false;
+    uint32_t sg_mat_m;
+    uint32_t sg_mat_n;
+    uint32_t sg_mat_k;

    std::recursive_mutex mutex;
    std::atomic_uint     inflight_threads = 0;
@ -284,20 +328,24 @@ struct webgpu_context_struct {
    webgpu_buf_pool param_buf_pool;
    webgpu_buf_pool set_rows_error_buf_pool;

+    pre_wgsl::Preprocessor p;
+
    std::map<int, webgpu_pipeline> memset_pipelines;                                 // variant or type index

    std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_pipelines;  // src0_type, src1_type, vectorized
    std::map<int, std::map<int, std::map<int, webgpu_pipeline>>>
        mul_mat_vec_pipelines;                                                       // src0_type, src1_type, vectorized

-    std::map<int, std::map<int, webgpu_pipeline>> set_rows_pipelines;                // dst_type, vectorized
-    std::map<int, std::map<int, webgpu_pipeline>> get_rows_pipelines;                // src_type, vectorized
+    std::unordered_map<flash_attn_pipeline_key, webgpu_pipeline, flash_attn_pipeline_key_hash> flash_attn_pipelines;

-    std::map<int, std::map<int, webgpu_pipeline>> cpy_pipelines;                     // src_type, dst_type
-    std::map<int, std::map<int, webgpu_pipeline>> add_pipelines;                     // type, inplace
-    std::map<int, std::map<int, webgpu_pipeline>> sub_pipelines;                     // type, inplace
-    std::map<int, std::map<int, webgpu_pipeline>> mul_pipelines;                     // type, inplace
-    std::map<int, std::map<int, webgpu_pipeline>> div_pipelines;                     // type, inplace
+    std::map<int, std::map<int, webgpu_pipeline>> set_rows_pipelines;                 // dst_type, vectorized
+    std::map<int, std::map<int, webgpu_pipeline>> get_rows_pipelines;                 // src_type, vectorized
+
+    std::map<int, std::map<int, webgpu_pipeline>> cpy_pipelines;                      // src_type, dst_type
+    std::map<int, std::map<int, webgpu_pipeline>> add_pipelines;                      // type, inplace
+    std::map<int, std::map<int, webgpu_pipeline>> sub_pipelines;                      // type, inplace
+    std::map<int, std::map<int, webgpu_pipeline>> mul_pipelines;                      // type, inplace
+    std::map<int, std::map<int, webgpu_pipeline>> div_pipelines;                      // type, inplace

    std::map<int, webgpu_pipeline>                               rms_norm_pipelines;  // inplace
    std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> rope_pipelines;      // type, ff, inplace
@ -361,8 +409,6 @@ struct ggml_backend_webgpu_buffer_context {
        label(std::move(lbl)) {}
 };

-/* End struct definitions */
-
 /* WebGPU object initializations */

 // Process a WGSL shader string, replacing tokens of the form {{KEY}} with
@ -484,14 +530,9 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
    encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
    wgpu::CommandBuffer commands = encoder.Finish();
    ctx->queue.Submit(1, &commands);
-
    ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0, ctx->debug_host_buf.GetSize());
-    const uint32_t * debug_data = (const uint32_t *) ctx->debug_host_buf.GetConstMappedRange();
-    std::cout << "debug data:";
-    for (size_t i = 0; i < WEBGPU_DEBUG_BUF_ELEMS; i++) {
-        std::cout << "  " << i << ": " << debug_data[i];
-    }
-    std::cout << "\n";
+    const float * debug_data = (const float *) ctx->debug_host_buf.GetConstMappedRange();
+    std::cout << "debug[0]: " << debug_data[0] << "\n";
    ctx->debug_host_buf.Unmap();
 }
 #endif
@ -673,6 +714,7 @@ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) {
    return ctx->name.c_str();
 }

+// TODO: implement proper cleanup
 static void ggml_backend_webgpu_free(ggml_backend_t backend) {
    ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context;
    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")");
@ -730,12 +772,12 @@ static wgpu::Buffer ggml_webgpu_tensor_buf(const ggml_tensor * tensor) {
    return ctx->buffer;
 }

-static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, ggml_tensor * t) {
+static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, const ggml_tensor * t) {
    size_t offset = ggml_webgpu_tensor_offset(t);
    return offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
 }

-static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor * t) {
+static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, const ggml_tensor * t) {
    size_t offset = ggml_webgpu_tensor_offset(t);
    return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
 }
@ -964,12 +1006,10 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
 #ifndef __EMSCRIPTEN__
            if (ctx->supports_subgroup_matrix) {
                // The total number of subgroups/workgroups needed per matrix.
-                uint32_t wg_m_sg_tile =
-                    WEBGPU_MUL_MAT_SUBGROUP_M * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M * ctx->subgroup_matrix_config.M;
-                wg_m = CEIL_DIV(dst->ne[0], wg_m_sg_tile);
-                uint32_t wg_n_sg_tile =
-                    WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
-                wg_n = CEIL_DIV(dst->ne[1], wg_n_sg_tile);
+                uint32_t wg_m_sg_tile = WEBGPU_MUL_MAT_SUBGROUP_M * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M * ctx->sg_mat_m;
+                wg_m                  = CEIL_DIV(dst->ne[0], wg_m_sg_tile);
+                uint32_t wg_n_sg_tile = WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->sg_mat_n;
+                wg_n                  = CEIL_DIV(dst->ne[1], wg_n_sg_tile);
            } else {
 #endif
                uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
@ -986,6 +1026,146 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }

+static webgpu_command ggml_webgpu_flash_attn(webgpu_context & ctx,
+                                             ggml_tensor *    Q,
+                                             ggml_tensor *    K,
+                                             ggml_tensor *    V,
+                                             ggml_tensor *    mask,
+                                             ggml_tensor *    sinks,
+                                             ggml_tensor *    dst) {
+    float scale = *(float *) dst->op_params;
+    float max_bias;
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+    float logit_softcap;
+    memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float));
+    if (logit_softcap != 0.0f) {
+        scale /= logit_softcap;
+    }
+    float n_head_log2 = float(1u << (uint32_t) floor(log2(Q->ne[2])));
+    float m0          = powf(2.0f, -(max_bias) / n_head_log2);
+    float m1          = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    const int has_mask  = (mask != nullptr);
+    const int has_sinks = (sinks != nullptr);
+
+    std::vector<uint32_t> params = {
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, Q) / ggml_type_size(Q->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type)),
+        has_mask ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)) : 0,
+        has_sinks ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, sinks) / ggml_type_size(sinks->type)) : 0,
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+        (uint32_t) Q->ne[2],                              // number of heads
+        (uint32_t) Q->ne[1],                              // sequence length (Q)
+        (uint32_t) K->ne[1],                              // sequence length (K/V)
+        (uint32_t) (Q->nb[1] / ggml_type_size(Q->type)),  // stride (elements/blocks) of Q in dimension 1
+        (uint32_t) (Q->nb[2] / ggml_type_size(Q->type)),  // stride (elements/blocks) of Q in dimension 2
+        (uint32_t) (Q->nb[3] / ggml_type_size(Q->type)),  // stride (elements/blocks) of Q in dimension 3
+        (uint32_t) (K->nb[1] / ggml_type_size(K->type)),  // stride (elements/blocks) of K in dimension 1
+        (uint32_t) (K->nb[2] / ggml_type_size(K->type)),  // stride (elements/blocks) of K in dimension 2
+        (uint32_t) (K->nb[3] / ggml_type_size(K->type)),  // stride (elements/blocks) of K in dimension 3
+        (uint32_t) (V->nb[1] / ggml_type_size(V->type)),  // stride (elements/blocks) of V in dimension 1
+        (uint32_t) (V->nb[2] / ggml_type_size(V->type)),  // stride (elements/blocks) of V in dimension 2
+        (uint32_t) (V->nb[3] / ggml_type_size(V->type)),  // stride (elements/blocks) of V in dimension 3
+        has_mask ? (uint32_t) (mask->nb[3] / ggml_type_size(mask->type)) : 0,  // stride of mask dim 3
+        (uint32_t) (Q->ne[2] / K->ne[2]),  // repeat factor for K/V in dim 2 (MHA/MQA/GQA)
+        *(uint32_t *) &scale,              // scale (possibly adjusted for logit softcap)
+        *(uint32_t *) &max_bias,
+        *(uint32_t *) &logit_softcap,
+        *(uint32_t *) &n_head_log2,
+        *(uint32_t *) &m0,
+        *(uint32_t *) &m1
+
+    };
+    std::vector<wgpu::BindGroupEntry> entries = {
+        { .binding = 0,
+         .buffer  = ggml_webgpu_tensor_buf(Q),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, Q),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, Q) },
+        { .binding = 1,
+         .buffer  = ggml_webgpu_tensor_buf(K),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, K),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, K) },
+        { .binding = 2,
+         .buffer  = ggml_webgpu_tensor_buf(V),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, V),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, V) }
+    };
+    uint32_t binding_index = 3;
+    if (has_mask) {
+        entries.push_back({ .binding = binding_index++,
+                            .buffer  = ggml_webgpu_tensor_buf(mask),
+                            .offset  = ggml_webgpu_tensor_align_offset(ctx, mask),
+                            .size    = ggml_webgpu_tensor_binding_size(ctx, mask) });
+    }
+    if (has_sinks) {
+        entries.push_back({ .binding = binding_index++,
+                            .buffer  = ggml_webgpu_tensor_buf(sinks),
+                            .offset  = ggml_webgpu_tensor_align_offset(ctx, sinks),
+                            .size    = ggml_webgpu_tensor_binding_size(ctx, sinks) });
+    }
+    entries.push_back({ .binding = binding_index++,
+                        .buffer  = ggml_webgpu_tensor_buf(dst),
+                        .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
+                        .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
+
+    bool kv_direct =
+        (K->type == GGML_TYPE_F16) && (Q->ne[0] % ctx->sg_mat_k == 0) && (K->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
+
+    flash_attn_pipeline_key key = {
+        .q_type             = Q->type,
+        .kv_type            = K->type,
+        .dst_type           = dst->type,
+        .head_dim_qk        = (uint32_t) Q->ne[0],
+        .head_dim_v         = (uint32_t) V->ne[0],
+        .kv_direct          = kv_direct,
+        .has_mask           = static_cast<bool>(has_mask),
+        .has_sinks          = static_cast<bool>(has_sinks),
+        .uses_logit_softcap = logit_softcap != 0.0f,
+    };
+
+    webgpu_pipeline                         pipeline;
+    ggml_webgpu_flash_attn_shader_decisions decisions = {};
+
+    auto it = ctx->flash_attn_pipelines.find(key);
+    if (it != ctx->flash_attn_pipelines.end()) {
+        pipeline  = it->second;
+        decisions = *static_cast<ggml_webgpu_flash_attn_shader_decisions *>(pipeline.context);
+    } else {
+        std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
+        it = ctx->flash_attn_pipelines.find(key);
+        if (it != ctx->flash_attn_pipelines.end()) {
+            pipeline  = it->second;
+            decisions = *static_cast<ggml_webgpu_flash_attn_shader_decisions *>(pipeline.context);
+        } else {
+            ggml_webgpu_flash_attn_shader_lib_context shader_lib_ctx = { .kv_type     = K->type,
+                                                                         .head_dim_qk = (uint32_t) Q->ne[0],
+                                                                         .head_dim_v  = (uint32_t) V->ne[0],
+                                                                         .kv_direct   = kv_direct,
+                                                                         .has_mask    = static_cast<bool>(has_mask),
+                                                                         .has_sinks   = static_cast<bool>(has_sinks),
+                                                                         .uses_logit_softcap = logit_softcap != 0.0f,
+                                                                         .sg_mat_m           = ctx->sg_mat_m,
+                                                                         .sg_mat_n           = ctx->sg_mat_n,
+                                                                         .sg_mat_k           = ctx->sg_mat_k,
+                                                                         .wg_mem_limit_bytes =
+                                                                             ctx->limits.maxComputeWorkgroupStorageSize,
+                                                                         .max_subgroup_size = ctx->max_subgroup_size };
+
+            ggml_webgpu_processed_shader processed =
+                ggml_webgpu_preprocess_flash_attn_shader(ctx->p, wgsl_flash_attn, shader_lib_ctx);
+            pipeline = ggml_webgpu_create_pipeline(ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+            pipeline.context = new ggml_webgpu_flash_attn_shader_decisions(processed.decisions);
+            ctx->flash_attn_pipelines.emplace(key, pipeline);
+            decisions = processed.decisions;
+        }
+    }
+
+    uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions.q_tile);
+    uint32_t wg_x        = wg_per_head * Q->ne[2] * Q->ne[3];  // wg per head * number of heads * number of batches
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
+}
+
 static webgpu_command ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
    uint32_t      ne       = (uint32_t) ggml_nelements(dst);
    ggml_unary_op unary_op = ggml_get_unary_op(dst);
@ -1397,6 +1577,8 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
            return ggml_webgpu_get_rows(ctx, src0, src1, node);
        case GGML_OP_MUL_MAT:
            return ggml_webgpu_mul_mat(ctx, src0, src1, node);
+        case GGML_OP_FLASH_ATTN_EXT:
+            return ggml_webgpu_flash_attn(ctx, src0, src1, src2, node->src[3], node->src[4], node);
        case GGML_OP_ADD:
            {
                int inplace = ggml_webgpu_tensor_equal(src0, node);
@ -1466,6 +1648,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
        webgpu_submission_futures new_futures = ggml_backend_webgpu_submit(ctx, commands);
        futures.push_back(new_futures);
    }
+
    ggml_backend_webgpu_wait(ctx, futures);
    ctx->inflight_threads--;
    WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx);
@ -1698,9 +1881,18 @@ static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_

 static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
-    // TODO: what do we actually want to return here? maxBufferSize might not be the full available memory.
-    *free                                    = ctx->webgpu_ctx->limits.maxBufferSize;
-    *total                                   = ctx->webgpu_ctx->limits.maxBufferSize;
+    // TODO: for now, return maxBufferSize as both free and total memory
+    // Track https://github.com/gpuweb/gpuweb/issues/5505 for updates.
+    uint64_t max_buffer_size = ctx->webgpu_ctx->limits.maxBufferSize;
+    // If we're on a 32-bit system, clamp to UINTPTR_MAX
+#if UINTPTR_MAX < UINT64_MAX
+    uint64_t max_ptr_size = static_cast<uint64_t>(UINTPTR_MAX);
+    if (max_buffer_size > max_ptr_size) {
+        max_buffer_size = max_ptr_size;
+    }
+#endif
+    *free  = static_cast<size_t>(max_buffer_size);
+    *total = static_cast<size_t>(max_buffer_size);
 }

 static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) {
@ -1808,15 +2000,15 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
 #ifndef __EMSCRIPTEN__
    if (webgpu_ctx->supports_subgroup_matrix) {
        std::map<std::string, std::string> sg_matrix_repls;
-        sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
+        sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->max_subgroup_size);
        sg_matrix_repls["WEBGPU_TILE_K"]            = std::to_string(WEBGPU_MUL_MAT_TILE_K);
        sg_matrix_repls["WEBGPU_SUBGROUP_M"]        = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_M);
        sg_matrix_repls["WEBGPU_SUBGROUP_N"]        = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_N);
        sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M);
        sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N);
-        sg_matrix_repls["WEBGPU_SG_MAT_M_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.M);
-        sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
-        sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
+        sg_matrix_repls["WEBGPU_SG_MAT_M_SIZE"]     = std::to_string(webgpu_ctx->sg_mat_m);
+        sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"]     = std::to_string(webgpu_ctx->sg_mat_n);
+        sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"]     = std::to_string(webgpu_ctx->sg_mat_k);

        proc_mul_mat_f32_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
        proc_mul_mat_f32_f32_vec =
@ -2328,6 +2520,7 @@ static void ggml_webgpu_init_soft_max_pipeline(webgpu_context & webgpu_ctx) {
        webgpu_ctx->device, wgsl_soft_max_f32_mask_f16_sink_inplace, "soft_max_f32_mask_f16_sink_inplace", constants);
 }

+// TODO: move most initialization logic here
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(params);

@ -2489,6 +2682,29 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                }
                break;
            }
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                if (!webgpu_ctx->supports_subgroup_matrix) {
+                    break;
+                }
+                // Head dimensions must fit in workgroup memory with minimum tile sizes
+                size_t     limit_bytes = webgpu_ctx->limits.maxComputeWorkgroupStorageSize;
+                const bool has_mask    = op->src[3] != nullptr;
+                const bool kv_direct   = src1->type == GGML_TYPE_F16 && (src0->ne[0] % webgpu_ctx->sg_mat_k) == 0 &&
+                                       (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0;
+                const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes(
+                    webgpu_ctx->sg_mat_m, webgpu_ctx->sg_mat_n, (uint32_t) src0->ne[0], (uint32_t) src2->ne[0],
+                    has_mask, kv_direct);
+                if (min_bytes > limit_bytes) {
+                    break;
+                }
+
+                supports_op = src0->type == GGML_TYPE_F32 &&
+                              (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 ||
+                               src1->type == GGML_TYPE_Q4_0 || src1->type == GGML_TYPE_Q8_0) &&
+                              src2->type == src1->type && op->type == GGML_TYPE_F32;
+                break;
+            }
        case GGML_OP_RMS_NORM:
            supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
            break;
@ -2606,6 +2822,7 @@ static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) {
 }

 // TODO: Does this need to be thread safe? Is it only called once?
+// TODO: move most logic to device_init function so backend can be freed/initialized properly
 // Only one device is supported for now
 static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);
@ -2665,7 +2882,9 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
            if (config.M == config.N && config.N == config.K && (config.K == 8 || config.K == 16) &&
                config.componentType == wgpu::SubgroupMatrixComponentType::F16 &&
                config.resultComponentType == wgpu::SubgroupMatrixComponentType::F16) {
-                ctx->subgroup_matrix_config  = config;
+                ctx->sg_mat_m                = config.M;
+                ctx->sg_mat_n                = config.N;
+                ctx->sg_mat_k                = config.K;
                valid_subgroup_matrix_config = true;
                break;
            }
@ -2676,7 +2895,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
 #endif
    // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
    // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
-    ctx->subgroup_size = info.subgroupMaxSize;
+    ctx->max_subgroup_size = info.subgroupMaxSize;

    // Initialize device
    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
@ -2701,8 +2920,11 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
        wgpu::CallbackMode::AllowSpontaneous,
        [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
            GGML_UNUSED(device);
-            GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
-                           std::string(message).c_str());
+            GGML_UNUSED(reason);
+            GGML_UNUSED(message);
+            //TODO: uncomment once proper free logic is in place
+            //GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
+            //std::string(message).c_str());
        });
    dev_desc.SetUncapturedErrorCallback(
        [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
--- a/ggml/src/ggml-webgpu/pre_wgsl.hpp
+++ b/ggml/src/ggml-webgpu/pre_wgsl.hpp
@ -0,0 +1,778 @@
+#ifndef PRE_WGSL_HPP
+#define PRE_WGSL_HPP
+
+#include <cctype>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace pre_wgsl {
+
+//==============================================================
+// Options
+//==============================================================
+struct Options {
+    std::string              include_path = ".";
+    std::vector<std::string> macros;
+};
+
+//==============================================================
+// Utility: trim
+//==============================================================
+static std::string trim(const std::string & s) {
+    size_t a = 0;
+    while (a < s.size() && std::isspace((unsigned char) s[a])) {
+        a++;
+    }
+    size_t b = s.size();
+    while (b > a && std::isspace((unsigned char) s[b - 1])) {
+        b--;
+    }
+    return s.substr(a, b - a);
+}
+
+static std::string trim_value(std::istream & is) {
+    std::string str;
+    std::getline(is, str);
+    return trim(str);
+}
+
+static bool isIdentChar(char c) {
+    return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
+}
+
+static std::string expandMacrosRecursiveInternal(const std::string &                                  line,
+                                                 const std::unordered_map<std::string, std::string> & macros,
+                                                 std::unordered_set<std::string> &                    visiting);
+
+static std::string expandMacroValue(const std::string &                                  name,
+                                    const std::unordered_map<std::string, std::string> & macros,
+                                    std::unordered_set<std::string> &                    visiting) {
+    if (visiting.count(name)) {
+        throw std::runtime_error("Recursive macro: " + name);
+    }
+    visiting.insert(name);
+
+    auto it = macros.find(name);
+    if (it == macros.end()) {
+        visiting.erase(name);
+        return name;
+    }
+
+    const std::string & value = it->second;
+    if (value.empty()) {
+        visiting.erase(name);
+        return "";
+    }
+
+    std::string expanded = expandMacrosRecursiveInternal(value, macros, visiting);
+    visiting.erase(name);
+    return expanded;
+}
+
+static std::string expandMacrosRecursiveInternal(const std::string &                                  line,
+                                                 const std::unordered_map<std::string, std::string> & macros,
+                                                 std::unordered_set<std::string> &                    visiting) {
+    std::string result;
+    result.reserve(line.size());
+
+    size_t i = 0;
+    while (i < line.size()) {
+        if (isIdentChar(line[i])) {
+            size_t start = i;
+            while (i < line.size() && isIdentChar(line[i])) {
+                i++;
+            }
+            std::string token = line.substr(start, i - start);
+
+            auto it = macros.find(token);
+            if (it != macros.end()) {
+                result += expandMacroValue(token, macros, visiting);
+            } else {
+                result += token;
+            }
+        } else {
+            result += line[i];
+            i++;
+        }
+    }
+
+    return result;
+}
+
+static std::string expandMacrosRecursive(const std::string &                                  line,
+                                         const std::unordered_map<std::string, std::string> & macros) {
+    std::unordered_set<std::string> visiting;
+    return expandMacrosRecursiveInternal(line, macros, visiting);
+}
+
+//==============================================================
+// Tokenizer for expressions in #if/#elif
+//==============================================================
+class ExprLexer {
+  public:
+    enum Kind { END, IDENT, NUMBER, OP, LPAREN, RPAREN };
+
+    struct Tok {
+        Kind        kind;
+        std::string text;
+    };
+
+    explicit ExprLexer(std::string_view sv) : src(sv), pos(0) {}
+
+    Tok next() {
+        skipWS();
+        if (pos >= src.size()) {
+            return { END, "" };
+        }
+
+        char c = src[pos];
+
+        // number
+        if (std::isdigit((unsigned char) c)) {
+            size_t start = pos;
+            while (pos < src.size() && std::isdigit((unsigned char) src[pos])) {
+                pos++;
+            }
+            return { NUMBER, std::string(src.substr(start, pos - start)) };
+        }
+
+        // identifier
+        if (std::isalpha((unsigned char) c) || c == '_') {
+            size_t start = pos;
+            while (pos < src.size() && (std::isalnum((unsigned char) src[pos]) || src[pos] == '_')) {
+                pos++;
+            }
+            return { IDENT, std::string(src.substr(start, pos - start)) };
+        }
+
+        if (c == '(') {
+            pos++;
+            return { LPAREN, "(" };
+        }
+        if (c == ')') {
+            pos++;
+            return { RPAREN, ")" };
+        }
+
+        // multi-char operators
+        static const char * two_ops[] = { "==", "!=", "<=", ">=", "&&", "||", "<<", ">>" };
+        for (auto op : two_ops) {
+            if (src.substr(pos, 2) == op) {
+                pos += 2;
+                return { OP, std::string(op) };
+            }
+        }
+
+        // single-char operators
+        if (std::string("+-*/%<>!").find(c) != std::string::npos) {
+            pos++;
+            return { OP, std::string(1, c) };
+        }
+
+        // unexpected
+        pos++;
+        return { END, "" };
+    }
+
+  private:
+    std::string_view src;
+    size_t           pos;
+
+    void skipWS() {
+        while (pos < src.size() && std::isspace((unsigned char) src[pos])) {
+            pos++;
+        }
+    }
+};
+
+//==============================================================
+// Expression Parser (recursive descent)
+//==============================================================
+class ExprParser {
+  public:
+    ExprParser(std::string_view                                     expr,
+               const std::unordered_map<std::string, std::string> & macros,
+               std::unordered_set<std::string> &                    visiting) :
+        lex(expr),
+        macros(macros),
+        visiting(visiting) {
+        advance();
+    }
+
+    int parse() { return parseLogicalOr(); }
+
+  private:
+    ExprLexer                                            lex;
+    ExprLexer::Tok                                       tok;
+    const std::unordered_map<std::string, std::string> & macros;
+    std::unordered_set<std::string> &                    visiting;
+
+    void advance() { tok = lex.next(); }
+
+    bool acceptOp(const std::string & s) {
+        if (tok.kind == ExprLexer::OP && tok.text == s) {
+            advance();
+            return true;
+        }
+        return false;
+    }
+
+    bool acceptKind(ExprLexer::Kind k) {
+        if (tok.kind == k) {
+            advance();
+            return true;
+        }
+        return false;
+    }
+
+    int parseLogicalOr() {
+        int v = parseLogicalAnd();
+        while (acceptOp("||")) {
+            int rhs = parseLogicalAnd();
+            v       = (v || rhs);
+        }
+        return v;
+    }
+
+    int parseLogicalAnd() {
+        int v = parseEquality();
+        while (acceptOp("&&")) {
+            int rhs = parseEquality();
+            v       = (v && rhs);
+        }
+        return v;
+    }
+
+    int parseEquality() {
+        int v = parseRelational();
+        for (;;) {
+            if (acceptOp("==")) {
+                int rhs = parseRelational();
+                v       = (v == rhs);
+            } else if (acceptOp("!=")) {
+                int rhs = parseRelational();
+                v       = (v != rhs);
+            } else {
+                break;
+            }
+        }
+        return v;
+    }
+
+    int parseRelational() {
+        int v = parseShift();
+        for (;;) {
+            if (acceptOp("<")) {
+                int rhs = parseShift();
+                v       = (v < rhs);
+            } else if (acceptOp(">")) {
+                int rhs = parseShift();
+                v       = (v > rhs);
+            } else if (acceptOp("<=")) {
+                int rhs = parseShift();
+                v       = (v <= rhs);
+            } else if (acceptOp(">=")) {
+                int rhs = parseShift();
+                v       = (v >= rhs);
+            } else {
+                break;
+            }
+        }
+        return v;
+    }
+
+    int parseShift() {
+        int v = parseAdd();
+        for (;;) {
+            if (acceptOp("<<")) {
+                int rhs = parseAdd();
+                v       = (v << rhs);
+            } else if (acceptOp(">>")) {
+                int rhs = parseAdd();
+                v       = (v >> rhs);
+            } else {
+                break;
+            }
+        }
+        return v;
+    }
+
+    int parseAdd() {
+        int v = parseMult();
+        for (;;) {
+            if (acceptOp("+")) {
+                int rhs = parseMult();
+                v       = (v + rhs);
+            } else if (acceptOp("-")) {
+                int rhs = parseMult();
+                v       = (v - rhs);
+            } else {
+                break;
+            }
+        }
+        return v;
+    }
+
+    int parseMult() {
+        int v = parseUnary();
+        for (;;) {
+            if (acceptOp("*")) {
+                int rhs = parseUnary();
+                v       = (v * rhs);
+            } else if (acceptOp("/")) {
+                int rhs = parseUnary();
+                v       = (rhs == 0 ? 0 : v / rhs);
+            } else if (acceptOp("%")) {
+                int rhs = parseUnary();
+                v       = (rhs == 0 ? 0 : v % rhs);
+            } else {
+                break;
+            }
+        }
+        return v;
+    }
+
+    int parseUnary() {
+        if (acceptOp("!")) {
+            return !parseUnary();
+        }
+        if (acceptOp("-")) {
+            return -parseUnary();
+        }
+        if (acceptOp("+")) {
+            return +parseUnary();
+        }
+        return parsePrimary();
+    }
+
+    int parsePrimary() {
+        // '(' expr ')'
+        if (acceptKind(ExprLexer::LPAREN)) {
+            int v = parse();
+            if (!acceptKind(ExprLexer::RPAREN)) {
+                throw std::runtime_error("missing ')'");
+            }
+            return v;
+        }
+
+        // number
+        if (tok.kind == ExprLexer::NUMBER) {
+            int v = std::stoi(tok.text);
+            advance();
+            return v;
+        }
+
+        // defined(identifier)
+        if (tok.kind == ExprLexer::IDENT && tok.text == "defined") {
+            advance();
+            if (acceptKind(ExprLexer::LPAREN)) {
+                if (tok.kind != ExprLexer::IDENT) {
+                    throw std::runtime_error("expected identifier in defined()");
+                }
+                std::string name = tok.text;
+                advance();
+                if (!acceptKind(ExprLexer::RPAREN)) {
+                    throw std::runtime_error("missing ) in defined()");
+                }
+                return macros.count(name) ? 1 : 0;
+            } else {
+                // defined NAME
+                if (tok.kind != ExprLexer::IDENT) {
+                    throw std::runtime_error("expected identifier in defined NAME");
+                }
+                std::string name = tok.text;
+                advance();
+                return macros.count(name) ? 1 : 0;
+            }
+        }
+
+        // identifier -> treat as integer, if defined use its value else 0
+        if (tok.kind == ExprLexer::IDENT) {
+            std::string name = tok.text;
+            advance();
+            auto it = macros.find(name);
+            if (it == macros.end()) {
+                return 0;
+            }
+            if (it->second.empty()) {
+                return 1;
+            }
+            return evalMacroExpression(name, it->second);
+        }
+
+        // unexpected
+        return 0;
+    }
+
+    int evalMacroExpression(const std::string & name, const std::string & value) {
+        if (visiting.count(name)) {
+            throw std::runtime_error("Recursive macro: " + name);
+        }
+
+        visiting.insert(name);
+        ExprParser ep(value, macros, visiting);
+        int        v = ep.parse();
+        visiting.erase(name);
+        return v;
+    }
+};
+
+//==============================================================
+// Preprocessor
+//==============================================================
+class Preprocessor {
+  public:
+    explicit Preprocessor(Options opts = {}) : opts_(std::move(opts)) {
+        // Treat empty include path as current directory
+        if (opts_.include_path.empty()) {
+            opts_.include_path = ".";
+        }
+        parseMacroDefinitions(opts_.macros);
+    }
+
+    std::string preprocess_file(const std::string & filename, const std::vector<std::string> & additional_macros = {}) {
+        std::unordered_map<std::string, std::string> macros;
+        std::unordered_set<std::string>              predefined;
+        std::unordered_set<std::string>              include_stack;
+        buildMacros(additional_macros, macros, predefined);
+
+        std::string result = processFile(filename, macros, predefined, include_stack, DirectiveMode::All);
+        return result;
+    }
+
+    std::string preprocess(const std::string & contents, const std::vector<std::string> & additional_macros = {}) {
+        std::unordered_map<std::string, std::string> macros;
+        std::unordered_set<std::string>              predefined;
+        std::unordered_set<std::string>              include_stack;
+        buildMacros(additional_macros, macros, predefined);
+
+        std::string result = processString(contents, macros, predefined, include_stack, DirectiveMode::All);
+        return result;
+    }
+
+    std::string preprocess_includes_file(const std::string & filename) {
+        std::unordered_map<std::string, std::string> macros;
+        std::unordered_set<std::string>              predefined;
+        std::unordered_set<std::string>              include_stack;
+        std::string result = processFile(filename, macros, predefined, include_stack, DirectiveMode::IncludesOnly);
+        return result;
+    }
+
+    std::string preprocess_includes(const std::string & contents) {
+        std::unordered_map<std::string, std::string> macros;
+        std::unordered_set<std::string>              predefined;
+        std::unordered_set<std::string>              include_stack;
+        std::string result = processString(contents, macros, predefined, include_stack, DirectiveMode::IncludesOnly);
+        return result;
+    }
+
+  private:
+    Options                                      opts_;
+    std::unordered_map<std::string, std::string> global_macros;
+
+    enum class DirectiveMode { All, IncludesOnly };
+
+    struct Cond {
+        bool parent_active;
+        bool active;
+        bool taken;
+    };
+
+    //----------------------------------------------------------
+    // Parse macro definitions into global_macros
+    //----------------------------------------------------------
+    void parseMacroDefinitions(const std::vector<std::string> & macro_defs) {
+        for (const auto & def : macro_defs) {
+            size_t eq_pos = def.find('=');
+            if (eq_pos != std::string::npos) {
+                // Format: NAME=VALUE
+                std::string name    = trim(def.substr(0, eq_pos));
+                std::string value   = trim(def.substr(eq_pos + 1));
+                global_macros[name] = value;
+            } else {
+                // Format: NAME
+                std::string name    = trim(def);
+                global_macros[name] = "";
+            }
+        }
+    }
+
+    //----------------------------------------------------------
+    // Build combined macro map and predefined set for a preprocessing operation
+    //----------------------------------------------------------
+    void buildMacros(const std::vector<std::string> &               additional_macros,
+                     std::unordered_map<std::string, std::string> & macros,
+                     std::unordered_set<std::string> &              predefined) {
+        macros = global_macros;
+        predefined.clear();
+
+        for (const auto & [name, value] : global_macros) {
+            predefined.insert(name);
+        }
+
+        for (const auto & def : additional_macros) {
+            size_t      eq_pos = def.find('=');
+            std::string name, value;
+            if (eq_pos != std::string::npos) {
+                name  = trim(def.substr(0, eq_pos));
+                value = trim(def.substr(eq_pos + 1));
+            } else {
+                name  = trim(def);
+                value = "";
+            }
+
+            // Add to macros map (will override global if same name)
+            macros[name] = value;
+            predefined.insert(name);
+        }
+    }
+
+    //----------------------------------------------------------
+    // Helpers
+    //----------------------------------------------------------
+    std::string loadFile(const std::string & fname) {
+        std::ifstream f(fname);
+        if (!f.is_open()) {
+            throw std::runtime_error("Could not open file: " + fname);
+        }
+        std::stringstream ss;
+        ss << f.rdbuf();
+        return ss.str();
+    }
+
+    bool condActive(const std::vector<Cond> & cond) const {
+        if (cond.empty()) {
+            return true;
+        }
+        return cond.back().active;
+    }
+
+    //----------------------------------------------------------
+    // Process a file
+    //----------------------------------------------------------
+    std::string processFile(const std::string &                            name,
+                            std::unordered_map<std::string, std::string> & macros,
+                            const std::unordered_set<std::string> &        predefined_macros,
+                            std::unordered_set<std::string> &              include_stack,
+                            DirectiveMode                                  mode) {
+        if (include_stack.count(name)) {
+            throw std::runtime_error("Recursive include: " + name);
+        }
+
+        include_stack.insert(name);
+        std::string shader_code = loadFile(name);
+        std::string out         = processString(shader_code, macros, predefined_macros, include_stack, mode);
+        include_stack.erase(name);
+        return out;
+    }
+
+    std::string processIncludeFile(const std::string &                            fname,
+                                   std::unordered_map<std::string, std::string> & macros,
+                                   const std::unordered_set<std::string> &        predefined_macros,
+                                   std::unordered_set<std::string> &              include_stack,
+                                   DirectiveMode                                  mode) {
+        std::string full_path = opts_.include_path + "/" + fname;
+        return processFile(full_path, macros, predefined_macros, include_stack, mode);
+    }
+
+    //----------------------------------------------------------
+    // Process text
+    //----------------------------------------------------------
+    std::string processString(const std::string &                            shader_code,
+                              std::unordered_map<std::string, std::string> & macros,
+                              const std::unordered_set<std::string> &        predefined_macros,
+                              std::unordered_set<std::string> &              include_stack,
+                              DirectiveMode                                  mode) {
+        std::vector<Cond>  cond;  // Conditional stack for this shader
+        std::stringstream  out;
+        std::istringstream in(shader_code);
+        std::string        line;
+
+        while (std::getline(in, line)) {
+            std::string t = trim(line);
+
+            if (!t.empty() && t[0] == '#') {
+                bool handled = handleDirective(t, out, macros, predefined_macros, cond, include_stack, mode);
+                if (mode == DirectiveMode::IncludesOnly && !handled) {
+                    out << line << "\n";
+                }
+            } else {
+                if (mode == DirectiveMode::IncludesOnly) {
+                    out << line << "\n";
+                } else if (condActive(cond)) {
+                    // Expand macros in the line before outputting
+                    std::string expanded = expandMacrosRecursive(line, macros);
+                    out << expanded << "\n";
+                }
+            }
+        }
+
+        if (mode == DirectiveMode::All && !cond.empty()) {
+            throw std::runtime_error("Unclosed #if directive");
+        }
+
+        return out.str();
+    }
+
+    //----------------------------------------------------------
+    // Directive handler
+    //----------------------------------------------------------
+    bool handleDirective(const std::string &                            t,
+                         std::stringstream &                            out,
+                         std::unordered_map<std::string, std::string> & macros,
+                         const std::unordered_set<std::string> &        predefined_macros,
+                         std::vector<Cond> &                            cond,
+                         std::unordered_set<std::string> &              include_stack,
+                         DirectiveMode                                  mode) {
+        // split into tokens
+        std::string        body = t.substr(1);
+        std::istringstream iss(body);
+        std::string        cmd;
+        iss >> cmd;
+
+        if (cmd == "include") {
+            if (mode == DirectiveMode::All && !condActive(cond)) {
+                return true;
+            }
+            std::string file;
+            iss >> file;
+            if (file.size() >= 2 && file.front() == '"' && file.back() == '"') {
+                file = file.substr(1, file.size() - 2);
+            }
+            out << processIncludeFile(file, macros, predefined_macros, include_stack, mode);
+            return true;
+        }
+
+        if (mode == DirectiveMode::IncludesOnly) {
+            return false;
+        }
+
+        if (cmd == "define") {
+            if (!condActive(cond)) {
+                return true;
+            }
+            std::string name;
+            iss >> name;
+            // Don't override predefined macros from options
+            if (predefined_macros.count(name)) {
+                return true;
+            }
+            std::string value = trim_value(iss);
+            macros[name]      = value;
+            return true;
+        }
+
+        if (cmd == "undef") {
+            if (!condActive(cond)) {
+                return true;
+            }
+            std::string name;
+            iss >> name;
+            // Don't undef predefined macros from options
+            if (predefined_macros.count(name)) {
+                return true;
+            }
+            macros.erase(name);
+            return true;
+        }
+
+        if (cmd == "ifdef") {
+            std::string name;
+            iss >> name;
+            bool p = condActive(cond);
+            bool v = macros.count(name);
+            cond.push_back({ p, p && v, p && v });
+            return true;
+        }
+
+        if (cmd == "ifndef") {
+            std::string name;
+            iss >> name;
+            bool p = condActive(cond);
+            bool v = !macros.count(name);
+            cond.push_back({ p, p && v, p && v });
+            return true;
+        }
+
+        if (cmd == "if") {
+            std::string expr = trim_value(iss);
+            bool        p    = condActive(cond);
+            bool        v    = false;
+            if (p) {
+                std::unordered_set<std::string> visiting;
+                ExprParser                      ep(expr, macros, visiting);
+                v = ep.parse() != 0;
+            }
+            cond.push_back({ p, p && v, p && v });
+            return true;
+        }
+
+        if (cmd == "elif") {
+            std::string expr = trim_value(iss);
+
+            if (cond.empty()) {
+                throw std::runtime_error("#elif without #if");
+            }
+
+            Cond & c = cond.back();
+            if (!c.parent_active) {
+                c.active = false;
+                return true;
+            }
+
+            if (c.taken) {
+                c.active = false;
+                return true;
+            }
+
+            std::unordered_set<std::string> visiting;
+            ExprParser                      ep(expr, macros, visiting);
+            bool                            v = ep.parse() != 0;
+            c.active                          = v;
+            if (v) {
+                c.taken = true;
+            }
+            return true;
+        }
+
+        if (cmd == "else") {
+            if (cond.empty()) {
+                throw std::runtime_error("#else without #if");
+            }
+
+            Cond & c = cond.back();
+            if (!c.parent_active) {
+                c.active = false;
+                return true;
+            }
+            if (c.taken) {
+                c.active = false;
+            } else {
+                c.active = true;
+                c.taken  = true;
+            }
+            return true;
+        }
+
+        if (cmd == "endif") {
+            if (cond.empty()) {
+                throw std::runtime_error("#endif without #if");
+            }
+            cond.pop_back();
+            return true;
+        }
+
+        // Unknown directive
+        throw std::runtime_error("Unknown directive: #" + cmd);
+    }
+};
+
+}  // namespace pre_wgsl
+
+#endif  // PRE_WGSL_HPP
--- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl
@ -0,0 +1,591 @@
+diagnostic(off, chromium.subgroup_matrix_uniformity);
+diagnostic(off, subgroup_uniformity);
+enable f16;
+enable subgroups;
+enable chromium_experimental_subgroup_matrix;
+
+#ifdef KV_F32
+#define KV_TYPE f32
+#else
+#define KV_TYPE f16
+#endif
+
+// Default values
+#define HEAD_DIM_QK 64
+#define HEAD_DIM_V 64
+
+// The number of rows/columns/k in a subgroup matrix. MxK * KxN = MxN
+// Note that the "K" here does not correspond to the K in attention's Q/K/V, it's just the common dimension.
+#define SG_MAT_M 8
+#define SG_MAT_N 8
+#define SG_MAT_K 8
+
+// Each workgroup processes one subgroup matrix of Q rows
+#define Q_TILE SG_MAT_M
+#define KV_TILE 16
+#define WG_SIZE 64
+
+// Number of subgroup-matrix-width blocks that span the KV tile. SG_MAT_N must divide KV_TILE.
+#define KV_BLOCKS (KV_TILE / SG_MAT_N)
+
+// Quantization constants/helpers
+#define BLOCK_SIZE 32
+#define BLOCKS_K ((HEAD_DIM_QK + BLOCK_SIZE - 1) / BLOCK_SIZE)
+#define BLOCKS_V ((HEAD_DIM_V + BLOCK_SIZE - 1) / BLOCK_SIZE)
+// number of quantized elements processed per thread
+#if defined(KV_Q4_0)
+#define NQ 16
+// Q4_0 has 32 elements, 1 f16 for scale, 8 f16 for 4-bit weights
+#define F16_PER_BLOCK 9
+#define WEIGHTS_PER_F16 4
+#elif defined(KV_Q8_0)
+#define NQ 8
+// Q8_0 has 32 elements, 1 f16 for scale, 16 f16 for 8-bit weights
+#define F16_PER_BLOCK 17
+#define WEIGHTS_PER_F16 2
+#endif
+#define F16_PER_THREAD (NQ / WEIGHTS_PER_F16)
+
+// Ok not to put these in a define block, compiler will remove if unused
+fn get_byte(value: u32, index: u32) -> u32 {
+    return (value >> (index * 8)) & 0xFF;
+}
+
+fn get_byte_i32(value: u32, index: u32) -> i32 {
+    return bitcast<i32>(((value >> (index * 8)) & 0xFF) << 24) >> 24;
+}
+
+struct Params {
+    offset_q: u32,
+    offset_k: u32,
+    offset_v: u32,
+    offset_mask: u32,
+    offset_sinks: u32,
+    offset_dst: u32,
+
+    // shapes of Q/K/V
+    n_heads: u32,
+    seq_len_q: u32,
+    seq_len_kv: u32,
+
+    // strides (in elements)
+    stride_q1: u32,
+    stride_q2: u32,
+    stride_q3: u32,
+    stride_k1: u32,
+    stride_k2: u32,
+    stride_k3: u32,
+    stride_v1: u32,
+    stride_v2: u32,
+    stride_v3: u32,
+    stride_mask3: u32,
+
+    // repeat factors for K/V, e.g., MHA vs. MQA vs. GQA
+    q_per_kv: u32,
+
+    // softmax params
+    scale: f32,
+    max_bias: f32,
+    logit_softcap: f32,
+    n_head_log2: f32,
+    m0: f32,
+    m1: f32,
+};
+
+@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
+@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
+@group(0) @binding(2) var<storage, read_write> V: array<KV_TYPE>;
+
+#if defined(MASK) && defined(SINKS)
+@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
+@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
+#define DST_BINDING 5
+#define PARAMS_BINDING 6
+#elif defined(MASK)
+@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
+#define DST_BINDING 4
+#define PARAMS_BINDING 5
+#elif defined(SINKS)
+@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
+#define DST_BINDING 4
+#define PARAMS_BINDING 5
+#else
+#define DST_BINDING 3
+#define PARAMS_BINDING 4
+#endif
+
+@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<f32>;
+@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
+
+// Just a very small float value.
+const FLOAT_MIN: f32 = -1.0e9;
+
+// The number of Q rows processed per workgroup
+var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;
+
+#ifndef KV_DIRECT
+const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
+// we can reuse the same shmem for K and V since we only need one at a time
+var<workgroup> kv_shmem: array<f16, kv_shmem_size>;
+#endif
+
+var<workgroup> o_shmem: array<f16, Q_TILE * HEAD_DIM_V>; // output shmem
+
+#ifdef MASK
+// storage for mask values
+var<workgroup> mask_shmem: array<f16, Q_TILE * KV_TILE>;
+#endif
+
+// storage for output of Q*K^T scores for online softmax (S matrix from paper)
+// also storage for diagonal matrix during online softmax (P matrix from paper)
+// note that we reuse the same storage for both since we only need one at a time
+var<workgroup> inter_shmem: array<f16, Q_TILE * KV_TILE>;
+
+// Storage for row max and exp sum during online softmax
+var<workgroup> row_max_shmem: array<f32, Q_TILE>;
+var<workgroup> exp_sum_shmem: array<f32, Q_TILE>;
+
+fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32) -> f32 {
+    var v = select(FLOAT_MIN,
+                   f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]) * params.scale,
+                   kv_idx < KV_TILE);
+#ifdef LOGIT_SOFTCAP
+    v = params.logit_softcap * tanh(v);
+#endif
+#ifdef MASK
+    let mask_val = select(0.0, f32(mask_shmem[q_tile_row * KV_TILE + kv_idx]), kv_idx < KV_TILE);
+    let mask_term = slope * mask_val;
+    v += mask_term;
+#endif
+    return v;
+}
+
+
+@compute @workgroup_size(WG_SIZE)
+fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
+        @builtin(local_invocation_id) local_id: vec3<u32>,
+        @builtin(subgroup_id) subgroup_id: u32,
+        @builtin(subgroup_size) subgroup_size: u32,
+        @builtin(num_subgroups) num_subgroups: u32,
+        @builtin(subgroup_invocation_id) sg_inv_id: u32) {
+
+    // initialize row max for online softmax
+    for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
+        row_max_shmem[i] = FLOAT_MIN;
+        exp_sum_shmem[i] = 0.0;
+    }
+
+    for (var i = local_id.x; i < Q_TILE * HEAD_DIM_V; i += WG_SIZE) {
+        o_shmem[i] = 0.0;
+    }
+
+    // workgroups per head/batch
+    let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
+    let wg_per_batch = wg_per_head * params.n_heads;
+
+    let dst2_stride = HEAD_DIM_V * params.n_heads;
+    let dst3_stride = dst2_stride * params.seq_len_q;
+
+    // batch index
+    let batch_idx = wg_id.x / wg_per_batch;
+    let q_batch_offset = params.offset_q + batch_idx * params.stride_q3;
+    let k_batch_offset = params.offset_k + batch_idx * params.stride_k3;
+    let v_batch_offset = params.offset_v + batch_idx * params.stride_v3;
+    let dst_batch_offset = params.offset_dst + batch_idx * dst3_stride;
+    let wg_in_batch = wg_id.x % wg_per_batch;
+
+    // head index
+    let head_idx = wg_in_batch / wg_per_head;
+    let q_head_offset = q_batch_offset + head_idx * params.stride_q2;
+    let k_head_idx = head_idx / params.q_per_kv;
+    let v_head_idx = k_head_idx;
+    let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
+    let v_head_offset = v_batch_offset + v_head_idx * params.stride_v2;
+
+    // starting Q row for this workgroup
+    let wg_in_head = wg_in_batch % wg_per_head;
+    let q_row_start = wg_in_head * Q_TILE;
+
+#ifdef MASK
+    // mask offset
+    let mask_global_offset = params.offset_mask + batch_idx * params.stride_mask3 + q_row_start * params.seq_len_kv;
+#endif
+
+    // note that the output is permuted, the layout is [head_dim_v, n_heads, seq_len_q, batch_size]
+    let dst_global_offset = dst_batch_offset + q_row_start * dst2_stride + head_idx * HEAD_DIM_V;
+
+    let head = f32(head_idx);
+    let slope = select(1.0, select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), pow(params.m0, head + 1.0), head < params.n_head_log2), params.max_bias > 0);
+
+    // load q tile into shared memory
+    for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
+        let q_row = elem_idx / HEAD_DIM_QK;
+        let q_col = elem_idx % HEAD_DIM_QK;
+        let head_q_row = q_row_start + q_row;
+        let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
+        q_shmem[elem_idx] = f16(select(
+            0.0,
+            Q[global_q_row_offset + q_col],
+            head_q_row < params.seq_len_q && q_col < HEAD_DIM_QK));
+    }
+
+    for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) {
+      // clear inter_shmem to ensure zero-initialized accumulators
+      for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
+          inter_shmem[elem_idx] = 0.0;
+      }
+
+      // load k tile into shared memory
+#if defined(KV_Q4_0)
+      for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE * NQ) {
+          let blck_idx = elem_idx / BLOCK_SIZE;
+          let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
+          let k_row = blck_idx / BLOCKS_K;
+          let global_k_row = kv_tile + k_row;
+          let block_k = blck_idx % BLOCKS_K;
+          let row_offset = k_row * HEAD_DIM_QK;
+
+          if (global_k_row < params.seq_len_kv) {
+              let global_block_idx = k_head_offset + global_k_row * params.stride_k1 + block_k;
+              let base_idx = global_block_idx * F16_PER_BLOCK;
+              let d = K[base_idx]; // scale
+              for (var j = 0u; j < F16_PER_THREAD; j += 2) {
+                  let q_0 = K[base_idx + 1u + block_offset + j];
+                  let q_1 = K[base_idx + 1u + block_offset + j + 1];
+                  let q_packed = bitcast<u32>(vec2(q_0, q_1));
+                  for (var k = 0u; k < 4u; k++) {
+                      let q_byte = get_byte(q_packed, k);
+                      let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d;
+                      let q_lo = (f16(q_byte & 0xF) - 8.0) * d;
+                      let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
+                      kv_shmem[row_offset + idx] = q_lo;
+                      kv_shmem[row_offset + idx + 16u] = q_hi;
+                  }
+              }
+          }
+      }
+#elif defined(KV_Q8_0)
+      for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE * NQ) {
+          let blck_idx = elem_idx / BLOCK_SIZE;
+          let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
+          let k_row = blck_idx / BLOCKS_K;
+          let global_k_row = kv_tile + k_row;
+          let block_k = blck_idx % BLOCKS_K;
+          let row_offset = k_row * HEAD_DIM_QK;
+
+          if (global_k_row < params.seq_len_kv) {
+              let global_block_idx = k_head_offset + global_k_row * params.stride_k1 + block_k;
+              let base_idx = global_block_idx * F16_PER_BLOCK;
+              let d = K[base_idx]; // scale
+              for (var j = 0u; j < F16_PER_THREAD; j += 2) {
+                  let q_0 = K[base_idx + 1u + block_offset + j];
+                  let q_1 = K[base_idx + 1u + block_offset + j + 1];
+                  let q_packed = bitcast<u32>(vec2(q_0, q_1));
+                  for (var k = 0u; k < 4u; k++) {
+                      let q_byte = get_byte_i32(q_packed, k);
+                      let q_val = f16(q_byte) * d;
+                      let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
+                      kv_shmem[row_offset + idx] = q_val;
+                  }
+              }
+          }
+      }
+#elif defined(KV_DIRECT)
+      // Direct global loads for KV
+#else
+      for (var elem_idx = local_id.x; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
+          let k_row = elem_idx / HEAD_DIM_QK;
+          let k_col = elem_idx % HEAD_DIM_QK;
+          let global_k_row = kv_tile + k_row;
+          let global_k_row_offset = k_head_offset + global_k_row * params.stride_k1;
+          kv_shmem[elem_idx] = f16(select(
+              0.0,
+              K[global_k_row_offset + k_col],
+              global_k_row < params.seq_len_kv && k_col < HEAD_DIM_QK));
+      }
+#endif
+
+      workgroupBarrier();
+
+      // accumulate q block * k block into registers across the entire KV tile
+      // TODO: this loop seems to be the current largest bottleneck
+      for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) {
+          let inter_offset = kv_block * SG_MAT_N;
+          var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<
+              subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE);
+#ifdef KV_DIRECT
+          let k_block_row = kv_tile + kv_block * SG_MAT_N;
+          let k_global_offset = k_head_offset + k_block_row * params.stride_k1;
+#else
+          let k_block_offset = kv_block * SG_MAT_N * HEAD_DIM_QK;
+#endif
+          for (var head_dim_block = 0u; head_dim_block < HEAD_DIM_QK; head_dim_block += SG_MAT_K) {
+              // load q submatrix from shared memory
+              var q_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
+                  &q_shmem,
+                  head_dim_block,
+                  false,
+                  HEAD_DIM_QK
+              );
+
+              // load k submatrix from device or shared memory
+#ifdef KV_DIRECT
+              var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
+                  &K,
+                  k_global_offset + head_dim_block,
+                  true,
+                  params.stride_k1
+              );
+#else
+              var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
+                  &kv_shmem,
+                  k_block_offset + head_dim_block,
+                  true,
+                  HEAD_DIM_QK
+              );
+#endif
+              acc = subgroupMatrixMultiplyAccumulate(q_sg_mat, k_sg_mat, acc);
+          }
+
+          // store acc to shared memory for softmax (S matrix from paper)
+          subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE);
+      }
+
+#ifdef MASK
+      // load mask tile into shared memory for this KV block
+      // TODO: optimize and skip if mask is -INF for the entire tile
+      for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
+          let mask_row = elem_idx / KV_TILE;
+          let mask_col = elem_idx % KV_TILE;
+          let global_q_row = q_row_start + mask_row;
+          let global_k_col = kv_tile + mask_col;
+          let mask_in_bounds = global_q_row < params.seq_len_q && global_k_col < params.seq_len_kv;
+          let mask_idx = mask_global_offset + mask_row * params.seq_len_kv + global_k_col;
+          mask_shmem[elem_idx] = select(0.0, mask[mask_idx], mask_in_bounds);
+      }
+#endif
+
+      workgroupBarrier();
+
+      // online softmax
+      for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
+          let global_q_row = q_row_start + q_tile_row;
+          if (global_q_row >= params.seq_len_q) {
+              break;
+          }
+
+          // initialize running max for this row
+          var prev_max = row_max_shmem[q_tile_row];
+          var final_max = prev_max;
+          // pass 1: compute final max across the full KV tile in chunks
+          for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
+              let kv_idx = kv_offset + sg_inv_id;
+              let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope);
+              final_max = subgroupMax(max(final_max, softmax_term));
+          }
+
+          var total_exp_term: f32 = 0.0;
+          // pass 2: compute exp sum and write P using final_max
+          for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
+              let kv_idx = kv_offset + sg_inv_id;
+              let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope);
+              let cur_p = select(0.0,
+                                 exp(softmax_term - final_max),
+                                 kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
+              total_exp_term += subgroupAdd(cur_p);
+              if (kv_idx < KV_TILE) {
+                  inter_shmem[kv_idx + q_tile_row * KV_TILE] = f16(cur_p);
+              }
+          }
+
+          let cur_exp = exp(prev_max - final_max);
+
+          if (sg_inv_id == 0) {
+              row_max_shmem[q_tile_row] = final_max;
+              exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * cur_exp + total_exp_term;
+          }
+
+          for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
+              let idx = q_tile_row * HEAD_DIM_V + elem_idx;
+              o_shmem[idx] = f16(f32(o_shmem[idx]) * cur_exp);
+          }
+      }
+
+      // load v tile into shared memory
+#if defined(KV_Q4_0)
+      for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE * NQ) {
+          let blck_idx = elem_idx / BLOCK_SIZE;
+          let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
+          let v_row = blck_idx / BLOCKS_V;
+          let global_v_row = kv_tile + v_row;
+          let block_k = blck_idx % BLOCKS_V;
+          let row_offset = v_row * HEAD_DIM_V;
+
+          if (global_v_row < params.seq_len_kv) {
+              let global_block_idx = v_head_offset + global_v_row * params.stride_v1 + block_k;
+              let base_idx = global_block_idx * F16_PER_BLOCK;
+              let d = V[base_idx]; // scale
+              for (var j = 0u; j < F16_PER_THREAD; j += 2) {
+                  let q_0 = V[base_idx + 1u + block_offset + j];
+                  let q_1 = V[base_idx + 1u + block_offset + j + 1];
+                  let q_packed = bitcast<u32>(vec2(q_0, q_1));
+                  for (var k = 0u; k < 4u; k++) {
+                      let q_byte = get_byte(q_packed, k);
+                      let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d;
+                      let q_lo = (f16(q_byte & 0xF) - 8.0) * d;
+                      let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
+                      kv_shmem[row_offset + idx] = q_lo;
+                      kv_shmem[row_offset + idx + 16u] = q_hi;
+                  }
+              }
+          }
+      }
+#elif defined(KV_Q8_0)
+      for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE * NQ) {
+          let blck_idx = elem_idx / BLOCK_SIZE;
+          let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
+          let v_row = blck_idx / BLOCKS_V;
+          let global_v_row = kv_tile + v_row;
+          let block_k = blck_idx % BLOCKS_V;
+          let row_offset = v_row * HEAD_DIM_V;
+
+          if (global_v_row < params.seq_len_kv) {
+              let global_block_idx = v_head_offset + global_v_row * params.stride_v1 + block_k;
+              let base_idx = global_block_idx * F16_PER_BLOCK;
+              let d = V[base_idx]; // scale
+              for (var j = 0u; j < F16_PER_THREAD; j += 2) {
+                  let q_0 = V[base_idx + 1u + block_offset + j];
+                  let q_1 = V[base_idx + 1u + block_offset + j + 1];
+                  let q_packed = bitcast<u32>(vec2(q_0, q_1));
+                  for (var k = 0u; k < 4u; k++) {
+                      let q_byte = get_byte_i32(q_packed, k);
+                      let q_val = f16(q_byte) * d;
+                      let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
+                      kv_shmem[row_offset + idx] = q_val;
+                  }
+              }
+          }
+      }
+#elif defined(KV_DIRECT)
+      // Direct global loads for KV
+#else
+      for (var elem_idx = local_id.x; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE) {
+          let v_row = elem_idx / HEAD_DIM_V;
+          let v_col = elem_idx % HEAD_DIM_V;
+          let global_v_row = kv_tile + v_row;
+          let global_v_row_offset = v_head_offset + global_v_row * params.stride_v1;
+          kv_shmem[elem_idx] = f16(select(
+              0.0,
+              V[global_v_row_offset + v_col],
+              global_v_row < params.seq_len_kv && v_col < HEAD_DIM_V));
+      }
+#endif
+
+      workgroupBarrier();
+
+      // we have P (Q_TILE x KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
+      // we want to compute O += P * V across the full KV tile
+      for (var head_dim_block = subgroup_id * SG_MAT_N;
+           head_dim_block < HEAD_DIM_V;
+           head_dim_block += num_subgroups * SG_MAT_N) {
+              // load O submatrix from shared memory
+              var o_sg_mat: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(
+                  &o_shmem,
+                  head_dim_block,
+                  false,
+                  HEAD_DIM_V
+              );
+
+              for (var kv_block = 0u; kv_block < KV_BLOCKS; kv_block++) {
+                  let p_offset = kv_block * SG_MAT_N;
+                  var p_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
+                      &inter_shmem,
+                      p_offset,
+                      false,
+                      KV_TILE
+                  );
+
+                  // load V submatrix from global or shared memory
+#ifdef KV_DIRECT
+                  let v_block_row = kv_tile + kv_block * SG_MAT_N;
+                  let v_global_offset = v_head_offset + v_block_row * params.stride_v1 + head_dim_block;
+                  var v_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
+                      &V,
+                      v_global_offset,
+                      false,
+                      params.stride_v1
+                  );
+#else
+                  let v_block_offset = kv_block * SG_MAT_N * HEAD_DIM_V;
+                  var v_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
+                      &kv_shmem,
+                      v_block_offset + head_dim_block,
+                      false,
+                      HEAD_DIM_V
+                  );
+#endif
+                  // O += P * V
+                  o_sg_mat = subgroupMatrixMultiplyAccumulate(p_sg_mat, v_sg_mat, o_sg_mat);
+              }
+
+              // store O back to shared memory
+              subgroupMatrixStore(&o_shmem, head_dim_block, o_sg_mat, false, HEAD_DIM_V);
+      }
+
+      workgroupBarrier();
+    }
+
+#ifdef SINKS
+    // add sinks (applied once after processing all KV tiles)
+    for (var q_tile_row = subgroup_id;
+         q_tile_row < Q_TILE;
+         q_tile_row += num_subgroups) {
+            // no need to process rows beyond seq_len_q
+            let global_q_row = q_row_start + q_tile_row;
+            if (global_q_row >= params.seq_len_q) {
+                break;
+            }
+
+            var prev_max = row_max_shmem[q_tile_row];
+
+            // for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
+            let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0);
+            let new_max = subgroupMax(max(prev_max, sink_val));
+            let max_exp = exp(prev_max - new_max);
+            let sink_exp = exp(sink_val - new_max);
+
+            let sink_exp_sum = subgroupAdd(sink_exp);
+
+            if (sg_inv_id == 0) {
+                exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * max_exp + sink_exp_sum;
+            }
+
+            for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
+                let idx = q_tile_row * HEAD_DIM_V + elem_idx;
+                let val = f32(o_shmem[idx]) * max_exp;
+                o_shmem[idx] = f16(val);
+            }
+    }
+
+    workgroupBarrier();
+#endif
+
+    // write output back to global memory
+    for (var q_tile_row = subgroup_id;
+         q_tile_row < Q_TILE;
+         q_tile_row += num_subgroups) {
+            let global_q_row = q_row_start + q_tile_row;
+            if (global_q_row >= params.seq_len_q) {
+                break;
+            }
+
+            let exp_sum = exp_sum_shmem[q_tile_row];
+            let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0);
+
+            for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
+                let o_val = o_shmem[q_tile_row * HEAD_DIM_V + elem_idx];
+                let scaled = f32(o_val) * scale;
+                dst[dst_global_offset + q_tile_row * dst2_stride + elem_idx] = scaled;
+            }
+    }
+}
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -276,12 +276,13 @@ class Keys:
        DATASETS    = "imatrix.datasets"

    class Clip:
-        PROJECTOR_TYPE      = "clip.projector_type"
-        HAS_VISION_ENCODER  = "clip.has_vision_encoder"
-        HAS_AUDIO_ENCODER   = "clip.has_audio_encoder"
-        HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
+        PROJECTOR_TYPE        = "clip.projector_type"
+        HAS_VISION_ENCODER    = "clip.has_vision_encoder"
+        HAS_AUDIO_ENCODER     = "clip.has_audio_encoder"
+        HAS_LLAVA_PROJECTOR   = "clip.has_llava_projector"

    class ClipVision:
+        PROJECTOR_TYPE      = "clip.vision.projector_type" # for mixed modality models
        IMAGE_SIZE          = "clip.vision.image_size"
        PREPROC_IMAGE_SIZE  = "clip.vision.preproc_image_size"
        PATCH_SIZE          = "clip.vision.patch_size"
@ -307,6 +308,7 @@ class Keys:
            SCALE_FACTOR    = "clip.vision.projector.scale_factor"

    class ClipAudio:
+        PROJECTOR_TYPE      = "clip.audio.projector_type" # for mixed modality models
        NUM_MEL_BINS        = "clip.audio.num_mel_bins"
        EMBEDDING_LENGTH    = "clip.audio.embedding_length"
        FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
@ -465,6 +467,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
    RESAMPLER = auto()
    GLM_EDGE  = auto()
    MERGER    = auto()
+    GEMMA3N   = auto()
    GEMMA3    = auto()
    QWEN3VL   = auto()
    COGVLM    = auto()
@ -675,6 +678,15 @@ class MODEL_TENSOR(IntEnum):
    V_MM_INP_NORM        = auto()
    V_MM_INP_PROJ        = auto() # gemma3
    V_MM_SOFT_EMB_NORM   = auto() # gemma3
+    V_MM_EMBEDDING       = auto() # gemma3n
+    V_MM_HARD_EMB_NORM   = auto() # gemma3n
+    V_ENC_CONV_STEM      = auto() # gemma3n
+    V_ENC_CONV_STEM_NORM = auto() # gemma3n
+    V_ENC_MSFA_EXP       = auto() # gemma3n
+    V_ENC_MSFA_EXP_NORM  = auto() # gemma3n
+    V_ENC_MSFA_PROJ      = auto() # gemma3n
+    V_ENC_MSFA_PROJ_NORM = auto() # gemma3n
+    V_ENC_MSFA_NORM      = auto() # gemma3n
    V_RESMPL_POS_EMBD_K  = auto() # minicpmv
    V_RESMPL_ATTN_Q      = auto() # minicpmv
    V_RESMPL_ATTN_K      = auto() # minicpmv
@ -698,30 +710,41 @@ class MODEL_TENSOR(IntEnum):
    V_TOK_BOI            = auto() # cogvlm
    V_TOK_EOI            = auto() # cogvlm
    # audio (mtmd)
-    A_ENC_EMBD_POS       = auto()
-    A_ENC_EMBD_NORM      = auto()
-    A_ENC_EMBD_TO_LOGITS = auto()
-    A_ENC_CONV1D         = auto()
-    A_PRE_NORM           = auto()
-    A_POST_NORM          = auto()
-    A_ENC_ATTN_Q         = auto()
-    A_ENC_ATTN_K         = auto()
-    A_ENC_ATTN_V         = auto()
-    A_ENC_INPUT_NORM     = auto()
-    A_ENC_OUTPUT         = auto()
-    A_ENC_OUTPUT_NORM    = auto()
-    A_ENC_FFN_UP         = auto()
-    A_ENC_FFN_NORM       = auto()
-    A_ENC_FFN_GATE       = auto()
-    A_ENC_FFN_DOWN       = auto()
-    A_ENC_FFN_UP_1       = auto()
-    A_ENC_FFN_NORM_1     = auto()
-    A_ENC_FFN_GATE_1     = auto()
-    A_ENC_FFN_DOWN_1     = auto()
-    A_MMPROJ             = auto()
-    A_MMPROJ_FC          = auto()
-    A_MM_NORM_PRE        = auto()
-    A_MM_NORM_MID        = auto()
+    A_ENC_EMBD_POS        = auto()
+    A_ENC_EMBD_NORM       = auto()
+    A_ENC_EMBD_TO_LOGITS  = auto() # lfm2
+    A_ENC_CONV1D          = auto()
+    A_ENC_CONV1D_NORM     = auto() # gemma3n
+    A_PRE_NORM            = auto()
+    A_POST_NORM           = auto()
+    A_ENC_LAYER_PRE_NORM  = auto() # gemma3n
+    A_ENC_ATTN_Q          = auto()
+    A_ENC_ATTN_K          = auto()
+    A_ENC_ATTN_V          = auto()
+    A_ENC_PER_DIM_SCALE   = auto() # gemma3n
+    A_ENC_INPUT_NORM      = auto()
+    A_ENC_OUTPUT          = auto()
+    A_ENC_OUTPUT_NORM     = auto()
+    A_ENC_FFN_UP          = auto()
+    A_ENC_FFN_NORM        = auto()
+    A_ENC_FFN_POST_NORM   = auto() # gemma3n
+    A_ENC_FFN_SCALE       = auto() # gemma3n
+    A_ENC_FFN_GATE        = auto()
+    A_ENC_FFN_DOWN        = auto()
+    A_ENC_FFN_UP_1        = auto() # lfm2, gemma3n
+    A_ENC_FFN_NORM_1      = auto() # lfm2, gemma3n (pre-norm)
+    A_ENC_FFN_POST_NORM_1 = auto() # gemma3n
+    A_ENC_FFN_SCALE_1     = auto() # gemma3n
+    A_ENC_FFN_GATE_1      = auto() # lfm2, gemma3n
+    A_ENC_FFN_DOWN_1      = auto() # lfm2, gemma3n
+    A_MMPROJ              = auto()
+    A_MMPROJ_FC           = auto()
+    A_MM_NORM_PRE         = auto()
+    A_MM_NORM_MID         = auto()
+    A_MM_EMBEDDING        = auto() # gemma3n
+    A_MM_HARD_EMB_NORM    = auto() # gemma3n
+    A_MM_SOFT_EMB_NORM    = auto() # gemma3n
+    A_MM_INP_PROJ         = auto() # gemma3n
    # nextn/mtp
    NEXTN_EH_PROJ        = auto()
    NEXTN_EMBED_TOKENS   = auto()
@ -1071,7 +1094,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_MM_POST_NORM:            "mm.post_norm",
    MODEL_TENSOR.V_MM_INP_PROJ:             "mm.input_projection",
    MODEL_TENSOR.V_MM_INP_NORM:             "mm.input_norm",
-    MODEL_TENSOR.V_MM_SOFT_EMB_NORM:        "mm.soft_emb_norm",
+    MODEL_TENSOR.V_MM_SOFT_EMB_NORM:        "mm.soft_emb_norm",         # gemma3n
+    MODEL_TENSOR.V_MM_EMBEDDING:            "mm.embedding",             # gemma3n
+    MODEL_TENSOR.V_MM_HARD_EMB_NORM:        "mm.hard_emb_norm",         # gemma3n
+    MODEL_TENSOR.V_ENC_CONV_STEM:           "v.conv_stem.conv",         # gemma3n
+    MODEL_TENSOR.V_ENC_CONV_STEM_NORM:      "v.conv_stem.bn",           # gemma3n
+    MODEL_TENSOR.V_ENC_MSFA_EXP:            "v.msfa.ffn.pw_exp.conv",   # gemma3n
+    MODEL_TENSOR.V_ENC_MSFA_EXP_NORM:       "v.msfa.ffn.pw_exp.bn",     # gemma3n
+    MODEL_TENSOR.V_ENC_MSFA_PROJ:           "v.msfa.ffn.pw_proj.conv",  # gemma3n
+    MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM:      "v.msfa.ffn.pw_proj.bn",    # gemma3n
+    MODEL_TENSOR.V_ENC_MSFA_NORM:           "v.msfa.norm",              # gemma3n
    MODEL_TENSOR.V_RESMPL_POS_EMBD_K:       "resampler.pos_embd_k",
    MODEL_TENSOR.V_RESMPL_ATTN_Q:           "resampler.attn.q",
    MODEL_TENSOR.V_RESMPL_ATTN_K:           "resampler.attn.k",
@ -1100,19 +1132,26 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.A_ENC_EMBD_NORM:           "a.position_embd_norm",
    MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS:      "a.embd_to_logits",
    MODEL_TENSOR.A_ENC_CONV1D:              "a.conv1d.{bid}",
+    MODEL_TENSOR.A_ENC_CONV1D_NORM:         "a.conv1d.{bid}.norm",
    MODEL_TENSOR.A_PRE_NORM:                "a.pre_ln",
    MODEL_TENSOR.A_POST_NORM:               "a.post_ln",
+    MODEL_TENSOR.A_ENC_LAYER_PRE_NORM:      "a.blk.{bid}.layer_pre_norm",
    MODEL_TENSOR.A_ENC_ATTN_Q:              "a.blk.{bid}.attn_q",
    MODEL_TENSOR.A_ENC_ATTN_K:              "a.blk.{bid}.attn_k",
    MODEL_TENSOR.A_ENC_ATTN_V:              "a.blk.{bid}.attn_v",
+    MODEL_TENSOR.A_ENC_PER_DIM_SCALE:       "a.blk.{bid}.per_dim_scale",
    MODEL_TENSOR.A_ENC_INPUT_NORM:          "a.blk.{bid}.ln1",
    MODEL_TENSOR.A_ENC_OUTPUT:              "a.blk.{bid}.attn_out",
    MODEL_TENSOR.A_ENC_OUTPUT_NORM:         "a.blk.{bid}.ln2",
    MODEL_TENSOR.A_ENC_FFN_NORM:            "a.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.A_ENC_FFN_POST_NORM:       "a.blk.{bid}.ffn_post_norm",
+    MODEL_TENSOR.A_ENC_FFN_SCALE:           "a.blk.{bid}.ffn_scale",
    MODEL_TENSOR.A_ENC_FFN_UP:              "a.blk.{bid}.ffn_up",
    MODEL_TENSOR.A_ENC_FFN_GATE:            "a.blk.{bid}.ffn_gate",
    MODEL_TENSOR.A_ENC_FFN_DOWN:            "a.blk.{bid}.ffn_down",
    MODEL_TENSOR.A_ENC_FFN_NORM_1:          "a.blk.{bid}.ffn_norm_1",
+    MODEL_TENSOR.A_ENC_FFN_POST_NORM_1:     "a.blk.{bid}.ffn_post_norm_1",
+    MODEL_TENSOR.A_ENC_FFN_SCALE_1:         "a.blk.{bid}.ffn_scale_1",
    MODEL_TENSOR.A_ENC_FFN_UP_1:            "a.blk.{bid}.ffn_up_1",
    MODEL_TENSOR.A_ENC_FFN_GATE_1:          "a.blk.{bid}.ffn_gate_1",
    MODEL_TENSOR.A_ENC_FFN_DOWN_1:          "a.blk.{bid}.ffn_down_1",
@ -1120,6 +1159,10 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.A_MMPROJ_FC:               "mm.a.fc",
    MODEL_TENSOR.A_MM_NORM_PRE:             "mm.a.norm_pre",
    MODEL_TENSOR.A_MM_NORM_MID:             "mm.a.norm_mid",
+    MODEL_TENSOR.A_MM_INP_PROJ:             "mm.a.input_projection",      # gemma3n
+    MODEL_TENSOR.A_MM_SOFT_EMB_NORM:        "mm.a.soft_emb_norm",         # gemma3n
+    MODEL_TENSOR.A_MM_EMBEDDING:            "mm.a.embedding",             # gemma3n
+    MODEL_TENSOR.A_MM_HARD_EMB_NORM:        "mm.a.hard_emb_norm",         # gemma3n
    # lfm2 audio
    MODEL_TENSOR.A_ENC_NORM_CONV:           "a.blk.{bid}.norm_conv",
    MODEL_TENSOR.A_ENC_LINEAR_POS:          "a.blk.{bid}.linear_pos",
@ -1170,6 +1213,15 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_MM_INP_PROJ,
        MODEL_TENSOR.V_MM_INP_NORM,
        MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
+        MODEL_TENSOR.V_MM_EMBEDDING,
+        MODEL_TENSOR.V_MM_HARD_EMB_NORM,
+        MODEL_TENSOR.V_ENC_CONV_STEM,
+        MODEL_TENSOR.V_ENC_CONV_STEM_NORM,
+        MODEL_TENSOR.V_ENC_MSFA_EXP,
+        MODEL_TENSOR.V_ENC_MSFA_EXP_NORM,
+        MODEL_TENSOR.V_ENC_MSFA_PROJ,
+        MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM,
+        MODEL_TENSOR.V_ENC_MSFA_NORM,
        MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
        MODEL_TENSOR.V_RESMPL_ATTN_Q,
        MODEL_TENSOR.V_RESMPL_ATTN_K,
@ -1197,19 +1249,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.A_ENC_EMBD_NORM,
        MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
        MODEL_TENSOR.A_ENC_CONV1D,
+        MODEL_TENSOR.A_ENC_CONV1D_NORM,
        MODEL_TENSOR.A_PRE_NORM,
        MODEL_TENSOR.A_POST_NORM,
+        MODEL_TENSOR.A_ENC_LAYER_PRE_NORM,
        MODEL_TENSOR.A_ENC_ATTN_Q,
        MODEL_TENSOR.A_ENC_ATTN_K,
        MODEL_TENSOR.A_ENC_ATTN_V,
+        MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
        MODEL_TENSOR.A_ENC_INPUT_NORM,
        MODEL_TENSOR.A_ENC_OUTPUT,
        MODEL_TENSOR.A_ENC_OUTPUT_NORM,
        MODEL_TENSOR.A_ENC_FFN_NORM,
+        MODEL_TENSOR.A_ENC_FFN_POST_NORM,
+        MODEL_TENSOR.A_ENC_FFN_SCALE,
        MODEL_TENSOR.A_ENC_FFN_UP,
        MODEL_TENSOR.A_ENC_FFN_GATE,
        MODEL_TENSOR.A_ENC_FFN_DOWN,
        MODEL_TENSOR.A_ENC_FFN_NORM_1,
+        MODEL_TENSOR.A_ENC_FFN_POST_NORM_1,
+        MODEL_TENSOR.A_ENC_FFN_SCALE_1,
        MODEL_TENSOR.A_ENC_FFN_UP_1,
        MODEL_TENSOR.A_ENC_FFN_GATE_1,
        MODEL_TENSOR.A_ENC_FFN_DOWN_1,
@ -1226,6 +1285,10 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.A_ENC_CONV_NORM,
        MODEL_TENSOR.A_ENC_CONV_PW1,
        MODEL_TENSOR.A_ENC_CONV_PW2,
+        MODEL_TENSOR.A_MM_INP_PROJ,
+        MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
+        MODEL_TENSOR.A_MM_EMBEDDING,
+        MODEL_TENSOR.A_MM_HARD_EMB_NORM,
    ],
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.TOKEN_EMBD,
@ -1675,6 +1738,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_POST_NORM,
        MODEL_TENSOR.ATTN_GATE,
+        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_GATE_INP_SHEXP,
        MODEL_TENSOR.FFN_UP_SHEXP,
@ -3496,6 +3560,8 @@ class GGUFValueType(IntEnum):

 class VisionProjectorType:
    GEMMA3 = "gemma3"
+    GEMMA3NV = "gemma3nv"
+    GEMMA3NA = "gemma3na"
    IDEFICS3 = "idefics3"
    PIXTRAL = "pixtral"
    LLAMA4 = "llama4"
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -1086,6 +1086,9 @@ class GGUFWriter:
    def add_clip_projector_type(self, value: str) -> None:
        self.add_string(Keys.Clip.PROJECTOR_TYPE, value)

+    def add_clip_vision_projector_type(self, value: str) -> None:
+        self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
+
    def add_vision_projection_dim(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)

@ -1168,6 +1171,9 @@ class GGUFWriter:

    # audio models

+    def add_clip_audio_projector_type(self, value: str) -> None:
+        self.add_string(Keys.ClipAudio.PROJECTOR_TYPE, value)
+
    def add_audio_projection_dim(self, value: int) -> None:
        self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)

--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -123,6 +123,40 @@ class TensorNameMap:
        MODEL_TENSOR.CONV1D: (
            "backbone.embed", # roberta
        ),
+
+        MODEL_TENSOR.V_MM_EMBEDDING: (
+            "model.embed_vision.embedding", # gemma3n
+        ),
+        MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
+            "model.embed_vision.hard_embedding_norm", # gemma3n
+        ),
+        MODEL_TENSOR.V_MM_INP_PROJ: (
+            "model.embed_vision.embedding_projection", # gemma3n
+        ),
+        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
+            "model.embed_vision.soft_embedding_norm", # gemma3n
+        ),
+        MODEL_TENSOR.V_ENC_CONV_STEM: (
+            "model.vision_tower.timm_model.conv_stem.conv", # gemma3n
+        ),
+        MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
+            "model.vision_tower.timm_model.conv_stem.bn", # gemma3n
+        ),
+        MODEL_TENSOR.V_ENC_MSFA_EXP: (
+            "model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
+        ),
+        MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
+            "model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
+        ),
+        MODEL_TENSOR.V_ENC_MSFA_PROJ: (
+            "model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
+        ),
+        MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
+            "model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
+        ),
+        MODEL_TENSOR.V_ENC_MSFA_NORM: (
+            "model.vision_tower.timm_model.msfa.norm", # gemma3n
+        ),
    }

    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@ -1575,6 +1609,11 @@ class TensorNameMap:
        MODEL_TENSOR.A_ENC_CONV1D: (
            "audio_tower.conv{bid}", # ultravox
            "conformer.pre_encode.conv.{bid}", # lfm2
+            "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV1D_NORM: (
+            "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
        ),

        MODEL_TENSOR.A_PRE_NORM: (),
@ -1587,40 +1626,64 @@ class TensorNameMap:
        MODEL_TENSOR.A_ENC_ATTN_Q: (
            "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
            "conformer.layers.{bid}.self_attn.linear_q", # lfm2
+            "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_ATTN_K: (
            "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
            "conformer.layers.{bid}.self_attn.linear_k", # lfm2
+            "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_ATTN_V: (
            "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
            "conformer.layers.{bid}.self_attn.linear_v", # lfm2
+            "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
+            "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
+            "conformer.layers.{bid}.norm", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_INPUT_NORM: (
            "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
            "conformer.layers.{bid}.norm_self_att", # lfm2
+            "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_OUTPUT: (
            "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
            "conformer.layers.{bid}.self_attn.linear_out", # lfm2
+            "conformer.layers.{bid}.attention.post", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
            "audio_tower.layers.{bid}.final_layer_norm", # ultravox
            "conformer.layers.{bid}.norm_out", # lfm2
+            "conformer.layers.{bid}.attention.post_norm", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_FFN_NORM: (
            "conformer.layers.{bid}.norm_feed_forward1", # lfm2
+            "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
+            "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_SCALE: (
+            "conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_FFN_UP: (
            "audio_tower.layers.{bid}.fc1", # ultravox
            "conformer.layers.{bid}.feed_forward1.linear1", # lfm2
+            "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_FFN_GATE: (),
@ -1628,22 +1691,35 @@ class TensorNameMap:
        MODEL_TENSOR.A_ENC_FFN_DOWN: (
            "audio_tower.layers.{bid}.fc2", # ultravox
            "conformer.layers.{bid}.feed_forward1.linear2", # lfm2
+            "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_FFN_UP_1: (
            "conformer.layers.{bid}.feed_forward2.linear1", # lfm2
+            "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
            "conformer.layers.{bid}.feed_forward2.linear2", # lfm2
+            "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_FFN_NORM_1: (
            "conformer.layers.{bid}.norm_feed_forward2", # lfm2
+            "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
+            "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
+            "conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_LINEAR_POS: (
            "conformer.layers.{bid}.self_attn.linear_pos", # lfm2
+            "conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_POS_BIAS_U: (
@ -1656,6 +1732,7 @@ class TensorNameMap:

        MODEL_TENSOR.A_ENC_OUT: (
            "conformer.pre_encode.out", # lfm2
+            "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
        ),

        # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
@ -1681,22 +1758,40 @@ class TensorNameMap:

        MODEL_TENSOR.A_ENC_CONV_DW: (
            "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
+            "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_CONV_NORM: (
            "conformer.layers.{bid}.conv.batch_norm", # lfm2
+            "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_CONV_PW1: (
            "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
+            "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_CONV_PW2: (
            "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
+            "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
        ),

        MODEL_TENSOR.A_ENC_NORM_CONV: (
            "conformer.layers.{bid}.norm_conv", # lfm2
+            "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
+        ),
+
+        MODEL_TENSOR.A_MM_EMBEDDING: (
+            "model.embed_audio.embedding", # gemma3n
+        ),
+        MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
+            "model.embed_audio.hard_embedding_norm", # gemma3n
+        ),
+        MODEL_TENSOR.A_MM_INP_PROJ: (
+            "model.embed_audio.embedding_projection", # gemma3n
+        ),
+        MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
+            "model.embed_audio.soft_embedding_norm", # gemma3n
        ),

        # NextN/MTP tensors for GLM4_MOE
--- a/include/llama.h
+++ b/include/llama.h
@ -309,6 +309,7 @@ extern "C" {
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only;      // only load the vocabulary, no weights
        bool use_mmap;        // use mmap if possible
+        bool use_direct_io;   // use direct io, takes precedence over use_mmap
        bool use_mlock;       // force system to keep model in RAM
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@ -494,7 +495,7 @@ extern "C" {
                    struct llama_context_params * cparams,
                                          float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                         size_t   margin,                // margin of memory to leave per device in bytes
+                                         size_t * margins,               // margins of memory to leave per device in bytes
                                       uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
                            enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

@ -1291,7 +1292,9 @@ extern "C" {
    // available samplers:

    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
+
+    /// seed == LLAMA_DEFAULT_SEED to use a random seed.
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    /// Setting k <= 0 makes this a noop
--- a/licenses/LICENSE-curl
+++ b/licenses/LICENSE-curl
@ -1,9 +1,22 @@
-Copyright (c) 1996 - 2025, Daniel Stenberg, daniel@haxx.se, and many contributors, see the THANKS file.
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1996 - 2026, Daniel Stenberg, <daniel@haxx.se>, and many
+contributors, see the THANKS file.

 All rights reserved.

-Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
+Permission to use, copy, modify, and distribute this software for any purpose
+with or without fee is hereby granted, provided that the above copyright
+notice and this permission notice appear in all copies.

-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
+NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.

-Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization of the copyright holder.
+Except as contained in this notice, the name of a copyright holder shall not
+be used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization of the copyright holder.
--- a/licenses/LICENSE-linenoise
+++ b/licenses/LICENSE-linenoise
@ -1,26 +0,0 @@
-Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
-Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
-Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice,
-  this list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/scripts/pr2wt.sh
+++ b/scripts/pr2wt.sh
@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+
+# intialize a new worktree from a PR number:
+#
+# - creates a new remote using the fork's clone URL
+# - creates a local branch tracking the remote branch
+# - creates a new worktree in a parent folder, suffixed with "-pr-$PR"
+#
+# sample usage:
+#   ./scripts/pr2wt.sh 12345
+#   ./scripts/pr2wt.sh 12345 opencode
+#   ./scripts/pr2wt.sh 12345 "cmake -B build && cmake --build build"
+#   ./scripts/pr2wt.sh 12345 "bash -l"
+
+function usage() {
+    echo "usage: $0 <pr_number> [cmd]"
+    exit 1
+}
+
+# check we are in the right directory
+if [[ ! -f "scripts/pr2wt.sh" ]]; then
+    echo "error: this script must be run from the root of the repository"
+    exit 1
+fi
+
+if [[ $# -lt 1 || $# -gt 2 ]]; then
+    usage
+fi
+
+PR=$1
+[[ "$PR" =~ ^[0-9]+$ ]] || { echo "error: PR number must be numeric"; exit 1; }
+
+url_origin=$(git config --get remote.origin.url) || {
+    echo "error: no remote named 'origin' in this repository"
+    exit 1
+}
+
+org_repo=$(echo $url_origin | cut -d/ -f4-)
+org_repo=${org_repo%.git}
+
+echo "org/repo: $org_repo"
+
+meta=$(curl -sSLf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")
+
+url_remote=$(echo "$meta" | jq -r '.head.repo.clone_url')
+head_ref=$(echo "$meta" | jq -r '.head.ref')
+
+echo "url:      $url_remote"
+echo "head_ref: $head_ref"
+
+url_remote_cur=$(git config --get "remote.pr/$PR.url" 2>/dev/null || true)
+
+if [[ "$url_remote_cur" != "$url_remote" ]]; then
+    git remote rm  pr/$PR 2> /dev/null
+    git remote add pr/$PR "$url_remote"
+fi
+
+git fetch "pr/$PR" "$head_ref"
+
+dir=$(basename $(pwd))
+
+git branch -D pr/$PR 2> /dev/null
+git worktree add -b pr/$PR ../$dir-pr-$PR pr/$PR/$head_ref 2> /dev/null
+
+wt_path=$(cd ../$dir-pr-$PR && pwd)
+
+echo "git worktree created in $wt_path"
+
+cd $wt_path
+git branch --set-upstream-to=pr/$PR/$head_ref
+git pull   --ff-only || {
+    echo "error: failed to pull pr/$PR"
+    exit 1
+}
+
+if [[ $# -eq 2 ]]; then
+    echo "executing: $2"
+    eval "$2"
+fi
--- a/scripts/sync_vendor.py
+++ b/scripts/sync_vendor.py
@ -16,7 +16,8 @@ vendor = {
    # "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h",
    "https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",

-    "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.28.0/httplib.h": "vendor/cpp-httplib/httplib.h",
+    "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.0/httplib.h": "vendor/cpp-httplib/httplib.h",
+    "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.0/LICENSE":   "vendor/cpp-httplib/LICENSE",

    "https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h",
 }
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@ -950,6 +950,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_ATTN_GATE,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@ -110,7 +110,7 @@ struct llama_file::impl {
        }
    }

-    void read_raw(void * ptr, size_t len) const {
+    void read_raw(void * ptr, size_t len) {
        size_t bytes_read = 0;
        while (bytes_read < len) {
            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@ -127,7 +127,7 @@ struct llama_file::impl {
        }
    }

-    uint32_t read_u32() const {
+    uint32_t read_u32() {
        uint32_t val;
        read_raw(&val, sizeof(val));
        return val;
@ -154,8 +154,8 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }

-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
-        throw std::runtime_error("DirectIO is not implemented on Windows.");
+    bool has_direct_io() const {
+        return true;
    }

    ~impl() {
@ -164,33 +164,45 @@ struct llama_file::impl {
        }
    }
 #else
-    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
 #ifdef __linux__
        // Try unbuffered I/O for read only
        if (use_direct_io && std::strcmp(mode, "rb") == 0) {
-            fd = open(fname, O_RDONLY | O_DIRECT);
-
-            if (fd != -1) {
-                struct stat file_stats{};
-                fstat(fd, &file_stats);
-
-                size = file_stats.st_size;
-                alignment = file_stats.st_blksize;
-
-                off_t ret = lseek(fd, 0, SEEK_SET);
-                if (ret == -1) {
-                    throw std::runtime_error(format("seek error: %s", strerror(errno)));
-                }
+            if (init_fd()) {
                return;
            }
-
-            LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
-                fname, strerror(errno));
+            LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
+                           fname, strerror(errno));
        }
 #endif
-        fp = ggml_fopen(fname, mode);
+        init_fp(mode);
+    }
+
+#ifdef __linux__
+    bool init_fd() {
+        fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
+
+        if (fd != -1) {
+            struct stat file_stats{};
+            fstat(fd, &file_stats);
+
+            size = file_stats.st_size;
+            alignment = file_stats.st_blksize;
+
+            off_t ret = lseek(fd, 0, SEEK_SET);
+            if (ret == -1) {
+                throw std::runtime_error(format("seek error: %s", strerror(errno)));
+            }
+            return true;
+        }
+        return false;
+    }
+#endif
+
+    void init_fp(const char * mode) {
+        fp = ggml_fopen(fname.c_str(), mode);
        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+            throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
@ -226,7 +238,7 @@ struct llama_file::impl {
        }
    }

-    void read_raw(void * ptr, size_t len) const {
+    void read_raw_unsafe(void * ptr, size_t len) {
        if (len == 0) {
            return;
        }
@ -249,6 +261,17 @@ struct llama_file::impl {
                    if (errno == EINTR) {
                        continue;  // Interrupted by signal, retry
                    }
+                    // Fallback to std::fread in case the DMA controller cannot access the buffer
+                    if (errno == EFAULT) {
+                        auto curr_off = tell();
+                        close(fd);
+                        fd = -1;
+                        alignment = 1;
+                        init_fp("rb");
+                        seek(curr_off, SEEK_SET);
+                        read_raw_unsafe(ptr, len);
+                        return;
+                    }
                    throw std::runtime_error(format("read error: %s", strerror(errno)));
                }
                if (ret == 0) {
@ -266,7 +289,8 @@ struct llama_file::impl {
        }
    }

-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+    void read_aligned_chunk(void * dest, size_t size) {
+        size_t offset = tell();
        off_t aligned_offset = offset & ~(alignment - 1);
        off_t offset_from_alignment = offset - aligned_offset;
        size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@ -283,13 +307,21 @@ struct llama_file::impl {
        std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);

        seek(aligned_offset, SEEK_SET);
-        read_raw(buffer.get(), bytes_to_read);
+        read_raw_unsafe(buffer.get(), bytes_to_read);

        uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
        memcpy(dest, reinterpret_cast<void *>(actual_data), size);
    }

-    uint32_t read_u32() const {
+    void read_raw(void * ptr, size_t len) {
+        if (has_direct_io()) {
+            read_aligned_chunk(ptr, len);
+        } else {
+            read_raw_unsafe(ptr, len);
+        }
+    }
+
+    uint32_t read_u32() {
        uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
@ -310,6 +342,10 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }

+    bool has_direct_io() const {
+        return fd != -1 && alignment > 1;
+    }
+
    ~impl() {
        if (fd != -1) {
            close(fd);
@ -318,17 +354,9 @@ struct llama_file::impl {
        }
    }
    int fd = -1;
+    std::string fname;
 #endif

-    void read_raw_at(void * ptr, size_t len, size_t offset) const {
-        if (alignment != 1) {
-            read_aligned_chunk(offset, ptr, len);
-        } else {
-            seek(offset, SEEK_SET);
-            read_raw(ptr, len);
-        }
-    }
-
    size_t read_alignment() const {
        return alignment;
    }
@ -347,6 +375,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }

 size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }

 int llama_file::file_id() const {
 #ifdef _WIN32
@ -361,10 +390,14 @@ int llama_file::file_id() const {
 }

 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
-void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
-void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
+void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#ifdef _WIN32
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#else
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
+#endif

-uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
+uint32_t llama_file::read_u32() { return pimpl->read_u32(); }

 void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
 void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@ -24,15 +24,16 @@ struct llama_file {

    void seek(size_t offset, int whence) const;

-    void read_raw(void * ptr, size_t len) const;
-    void read_raw_at(void * ptr, size_t len, size_t offset) const;
-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
-    uint32_t read_u32() const;
+    void read_raw(void * ptr, size_t len);
+    void read_raw_unsafe(void * ptr, size_t len);
+    void read_aligned_chunk(void * dest, size_t size);
+    uint32_t read_u32();

    void write_raw(const void * ptr, size_t len) const;
    void write_u32(uint32_t val) const;

    size_t read_alignment() const;
+    bool has_direct_io() const;
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits,
        bool use_mmap,
+        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader(
    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-    files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
+    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
    contexts.emplace_back(ctx);

+    use_direct_io = use_direct_io && files.back()->has_direct_io();
+
+    // Disable mmap in case Direct I/O is enabled and available
+    if (use_direct_io && use_mmap) {
+        use_mmap = false;
+        LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+    }
+
    // Save tensors data offset of the main file.
    // For subsidiary files, `meta` tensor data offset must not be used,
    // so we build a unified tensors index for weights.
@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader(
                }
            }

-            files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
+            files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
            contexts.emplace_back(ctx);

            // Save tensors data offset info of the shard.
@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader(
    }

    this->use_mmap = use_mmap;
+    this->use_direct_io = use_direct_io;
    this->check_tensors = check_tensors;
    this->no_alloc = no_alloc;
 }
@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data(
            const auto & file = files.at(weight->idx);

            if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->read_raw_at(cur->data, n_size, weight->offs);
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(cur->data, n_size);
                if (check_tensors) {
                    validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                        return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data(
                        ggml_backend_event_synchronize(events[buffer_idx]);

                        // Read aligned chunk from file
-                        file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+                        file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);

                        // Calculate actual data portion (excluding alignment padding)
                        uintptr_t ptr_data = ptr_dest_aligned;
@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data(
                    }
                } else {
                    read_buf.resize(n_size);
-                    file->read_raw_at(read_buf.data(), n_size, weight->offs);
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), n_size);
                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@ -70,6 +70,7 @@ struct llama_model_loader {
    size_t   n_bytes    = 0;

    bool use_mmap = false;
+    bool use_direct_io = false;
    bool check_tensors;
    bool no_alloc;

@ -97,6 +98,7 @@ struct llama_model_loader {
        const std::string & fname,
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        bool use_mmap,
+        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -2440,7 +2440,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

    const bool use_mmap_buffer = true;

-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");

    // build a list of buffer types for the CPU and GPU devices
    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@ -2451,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
    }

+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
    // calculate the split points
    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
    std::vector<float> splits(n_devices());
@ -2461,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            size_t total;
            size_t free;
            ggml_backend_dev_memory(dev, &free, &total);
+
+            // devices can return 0 bytes for free and total memory if they do not
+            // have any to report. in this case, we will use the host memory as a fallback
+            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+            if (free == 0 && total == 0) {
+                ggml_backend_dev_memory(cpu_dev, &free, &total);
+            }
            splits[i] = free;
        }
    } else {
@ -2477,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        splits[i] /= split_sum;
    }

-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@ -6754,7 +6763,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        } else {
                            // Linear attention (gated delta net) specific tensors
                            // Create tensors with calculated dimensions
-                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, 0);
+                            // note: ssm_in is used by legacy GGUF
+                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
+                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
@ -7973,6 +7985,7 @@ llama_model_params llama_model_default_params() {
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
+        /*.use_direct_io               =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@ -2142,7 +2142,7 @@ struct llama_sampler_xtc {
    const uint32_t seed;
    uint32_t       seed_cur;

-    std::mt19937    rng;
+    std::mt19937   rng;
 };

 static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
        }
    }
    for (size_t i = 0; i < ret.size(); i++) {
-        size_t free, total;
+        size_t free;
+        size_t total;
        ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+        // devices can return 0 bytes for free and total memory if they do not
+        // have any to report. in this case, we will use the host memory as a fallback
+        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        if (free == 0 && total == 0) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (cpu_dev == nullptr) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+            ggml_backend_dev_memory(cpu_dev, &free, &total);
+        }
        ret[i].free  = free;
        ret[i].total = total;
    }
@ -147,9 +159,8 @@ class llama_params_fit_exception : public std::runtime_error {
 static void llama_params_fit_impl(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    constexpr int64_t MiB = 1024*1024;
-    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
    typedef std::vector<llama_device_memory_data> dmds_t;
    const llama_model_params default_mparams = llama_model_default_params();

@ -168,6 +179,12 @@ static void llama_params_fit_impl(
        return;
    }

+    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+    margins.reserve(nd);
+    for (size_t id = 0; id < nd; id++) {
+        margins.push_back(margins_s[id]);
+    }
+
    std::vector<std::string> dev_names;
    {
        dev_names.reserve(nd);
@ -187,9 +204,10 @@ static void llama_params_fit_impl(

    int64_t sum_free            = 0;
    int64_t sum_projected_free  = 0;
-    int64_t min_projected_free  = INT64_MAX;
    int64_t sum_projected_used  = 0;
    int64_t sum_projected_model = 0;
+    std::vector<int64_t> projected_free_per_device;
+    projected_free_per_device.reserve(nd);

    if (nd > 1) {
        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@ -199,45 +217,63 @@ static void llama_params_fit_impl(

        const int64_t projected_used = dmd.mb.total();
        const int64_t projected_free = dmd.free - projected_used;
+        projected_free_per_device.push_back(projected_free);

        sum_free            += dmd.free;
        sum_projected_used  += projected_used;
        sum_projected_free  += projected_free;
-        min_projected_free   = std::min(min_projected_free, projected_free);
        sum_projected_model += dmd.mb.model;

        if (nd > 1) {
-            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
-                projected_free >= 0 ? "surplus" : "deficit");
+            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
        }
    }
    assert(sum_free >= 0 && sum_projected_used >= 0);
    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
        __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (min_projected_free >= margin) {
-        if (nd == 1) {
+    if (nd == 1) {
+        if (projected_free_per_device[0] >= margins[0]) {
            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, min_projected_free/MiB, margin/MiB);
+                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+            return;
+        }
+    } else {
+        bool changes_needed = false;
+        for (size_t id = 0; id < nd; id++) {
+            if (projected_free_per_device[id] < margins[id]) {
+                changes_needed = true;
+                break;
+            }
+        }
+        if (!changes_needed) {
+            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
            return;
        }
-        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
-            __func__, min_projected_free/MiB, margin/MiB);
-        return;
    }

    // step 2: try reducing memory use by reducing the context size

    {
-        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
+        int64_t global_surplus = sum_projected_free;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus -= margins[id];
+        }
        if (global_surplus < 0) {
-            LLAMA_LOG_INFO(nd == 1 ?
-                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
-                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
-                __func__, margin/MiB, -global_surplus/MiB);
+            if (nd == 1) {
+                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                    __func__, margins[0]/MiB, -global_surplus/MiB);
+            } else {
+                LLAMA_LOG_INFO(
+                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
+                    __func__, -global_surplus/MiB);
+            }
            if (cparams->n_ctx == 0) {
                if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free - nd*margin_s;
+                    int64_t sum_used_target = sum_free;
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
                    if (nd > 1) {
                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                        //   - for dense models only whole layers can be assigned to devices
@ -448,9 +484,9 @@ static void llama_params_fit_impl(
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

-        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
-            global_surplus_cpu_moe += dmd.free;
-            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
+            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
        }

        if (global_surplus_cpu_moe > 0) {
@ -469,7 +505,7 @@ static void llama_params_fit_impl(
    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free - margin);
+        targets.push_back(dmds_full[id].free - margins[id]);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

@ -701,11 +737,11 @@ static void llama_params_fit_impl(
 enum llama_params_fit_status llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
    try {
-        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
    } catch (const llama_params_fit_exception & e) {
        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
@ -794,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
    model.t_start_us = tm.t_start_us;

    try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n-iswa.cpp
@ -255,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
        cb(inp_per_layer, "inp_per_layer_selected", -1);
+        res->add_input(std::move(inp));
    } else {
-        GGML_ABORT("TODO: support embd input");
+        // Vision embedding path: use padding token (ID=0) embedding
+        const int64_t embd_size = model.tok_embd_per_layer->ne[0];  // n_embd_altup * n_layer
+
+        // Extract and dequantize padding token embedding (column 0)
+        ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
+        ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
+        inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
+
+        // Reshape to [n_embd_altup, n_layer, 1]
+        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
+        cb(inp_per_layer, "inp_per_layer_vision", -1);
    }
-    res->add_input(std::move(inp));
    return inp_per_layer;
 }

@ -276,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
                                              -1);  // [n_embd_altup, n_layer, n_tokens]
    cb(per_layer_proj, "per_layer_proj", -1);

-    inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
+    inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
    inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
    cb(inp_per_layer, "inp_per_layer", -1);

--- a/src/models/models.h
+++ b/src/models/models.h
@ -466,7 +466,8 @@ private:
                ggml_tensor * cur,
                        int   il);

-    ggml_tensor * build_delta_net_chunking(
+    // returns pair of output and new state
+    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
                ggml_tensor * q,
                ggml_tensor * k,
                ggml_tensor * v,
@ -478,7 +479,8 @@ private:
                ggml_tensor * diag_mask,
                        int   il);

-    ggml_tensor * build_delta_net_autoregressive(
+    // returns pair of output and new state
+    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
                ggml_tensor * q,
                ggml_tensor * k,
                ggml_tensor * v,
@ -493,6 +495,11 @@ private:
                ggml_tensor * gate,
                        int   layer);

+    // returns pair of qkv, z
+    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
+                ggml_tensor * input,
+                        int   il);
+
    const llama_model & model;
 };

--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@ -86,7 +86,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
    ggml_build_forward_expand(gf, cur);
 }

-ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
+// utility to get one slice from the third dimension
+// input dim:  [x, y, c, b]
+// output dim: [x, y, 1, b]
+static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
+    return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
+        t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chunking(
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
@ -187,18 +195,16 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
    beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);

    ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+    cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)

-    cb(g_cumsum, "g_cumsum", il);
-
-    ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
+    ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
    ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);

    ggml_tensor * gcs_j_broadcast =
        ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);

    ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
-
-    cb(decay_mask, "decay_mask", il);
+    cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)

    decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
    decay_mask = ggml_exp(ctx0, decay_mask);
@ -208,8 +214,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(

    ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
    ggml_tensor * attn    = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
-
-    cb(attn, "attn_pre_solve", il);
+    cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)

    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
    ggml_tensor * lhs        = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
@ -217,8 +222,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
    ggml_tensor * lin_solve  = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
    attn                     = ggml_mul(ctx0, lin_solve, causal_mask);
    attn                     = ggml_add(ctx0, attn, identity);
-
-    cb(attn, "attn_solved", il);
+    cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)

    v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);

@ -226,116 +230,126 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
    ggml_tensor * gexp       = ggml_exp(ctx0, g_cumsum_t);

    ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
-
-    cb(kbeta_gexp, "kbeta_gexp", il);
+    cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)

    ggml_tensor * k_cumdecay =
        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
+    cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)

-    cb(k_cumdecay, "k_cumdecay", il);
+    ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
+    attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
+    attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
+    cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)

+
+    // vectorized calculation of key_gdiff
+    // improved from the chunked version:
+    //   g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
+    //   g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
+    //   key_gdiff = key * g_diff.unsqueeze(-1)
+    //   kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+    //   last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+
+    // get last element in g_cumsum along chunk_size dimension (ne0)
+    // example: [[x, y, z, ..., last], ...] -> [[last], ...]
+    ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
+                                        g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
+                                        (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
+    g_last = ggml_cont(ctx0, g_last);
+    cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+    ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
+    cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+    ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
+    cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+
+    ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
+    ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp);
+    cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
+
+
+    // state to be updated per chunk
+    ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
+    cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
+
+    // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
    ggml_tensor * core_attn_out = nullptr;
-    ggml_tensor * new_state = ggml_dup(ctx0, state);
-
-    cb(new_state, "new_state", il);

    for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
-        auto chunkify = [=](ggml_tensor * t) {
-            return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
-                t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
-        };
+        // shape: (S_k, chunk_size, 1, H_k * n_seqs)
+        ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul

-        auto chunkify_g = [=](ggml_tensor * t) {
-            return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, t->ne[1], 1, t->ne[3],
-                t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
-        };
+        // shape: (S_v, chunk_size, 1, H_v * n_seqs)
+        ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat

-        ggml_tensor * k_chunk = chunkify(k);
-        ggml_tensor * q_chunk = chunkify(q);
-        ggml_tensor * v_chunk = chunkify(v);
+        // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+        ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul

-        ggml_tensor * g_cs_chunk = chunkify_g(g_cumsum);
-        ggml_tensor * g_cs_chunk_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cs_chunk));
-
-        ggml_tensor * decay_mask_chunk = chunkify(decay_mask);
-        ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
-
-        ggml_tensor * gexp_chunk = ggml_exp(ctx0, g_cs_chunk_t);
+        // shape: (chunk_size, 1, H_v * n_seqs)
+        ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat

        // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
-        attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
-        attn = ggml_mul(ctx0, attn, decay_mask_chunk);
-        attn = ggml_mul(ctx0, attn, diag_mask);
+        // replaced by precomputed attn_kq
+        ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
+        cb(attn_chunk, "attn_chunk", il);

        ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);

        // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
        ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
+        cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)

        // v_new = v_i - v_prime
        ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
        ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
+        cb(v_new, "v_new_chunk", il);

        // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
        ggml_tensor * q_g_exp    = ggml_mul(ctx0, q_chunk, gexp_chunk);
        ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
+        cb(attn_inter, "attn_inter_chunk", il);

        // core_attn_out[:, :, i] = attn_inter + attn @ v_new
-        ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
+        ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
+        cb(v_attn, "v_attn_chunk", il);

        ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
+        cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)

-        core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
+        core_attn_out = core_attn_out == nullptr
+            ? core_attn_out_chunk
+            : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);

-        // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
-        // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
-        // key_gdiff = key * g_diff.unsqueeze(-1)
        // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+        ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk));
+        //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
+        ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff)));
+
        // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
-
-        ggml_tensor * g_cum_last =
-            ggml_cont(ctx0, ggml_view_4d(ctx0, g_cs_chunk_t, g_cs_chunk_t->ne[0], 1, g_cs_chunk_t->ne[2], g_cs_chunk_t->ne[3],
-                                        g_cs_chunk_t->nb[1], g_cs_chunk_t->nb[2], g_cs_chunk_t->nb[3],
-                                        g_cs_chunk_t->nb[0] * (g_cs_chunk_t->ne[1] - 1)));
-
-        ggml_tensor * gexp_last =
-            ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
-
-        ggml_tensor * g_cum_last_3d =
-            ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
-
-        ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cs_chunk, g_cs_chunk->ne[0], g_cs_chunk->ne[2], g_cs_chunk->ne[3]);
-
-        ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
-
-        ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
-
-        ggml_tensor * key_gdiff = ggml_mul(ctx0, k_chunk,
-                                        ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
-                                                        g_diff_exp->ne[2] * g_diff_exp->ne[3]));
-
-        ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
-
+        ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
        new_state = ggml_add(ctx0,
-            ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last, gexp_last->ne[0], gexp_last->ne[1], H_v, n_seqs)),
+            ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
            ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
    }

-    core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
-
-    ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0);
+    // truncate padded tokens
+    ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
+            S_v, n_tokens, H_v, n_seqs,
+            ggml_row_size(core_attn_out->type, S_v),
+            ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
+            ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
+    output_tokens = ggml_cont(ctx0, output_tokens);
    cb(output_tokens, "output_tokens", il);

-    // flatten output
-    ggml_tensor * flat_output =
-        ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
+    // permute back to (S_v, H_v, n_tokens, n_seqs)
+    output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
+    output_tokens = ggml_cont(ctx0, output_tokens);

-    ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs);
-
-    return ggml_concat(ctx0, flat_output, flat_state, 0);
+    return {output_tokens, new_state};
 }

-ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive(
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
@ -419,11 +433,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
    cb(core_attn_out, "output_tokens", il);
    cb(state, "new_state", il);

-    // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
-    ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
-    ggml_tensor * flat_state  = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
-
-    return ggml_concat(ctx0, flat_output, flat_state, 0);
+    return {core_attn_out, state};
 }

 ggml_tensor * llm_build_qwen3next::build_norm_gated(
@ -523,6 +533,87 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
    return cur;
 }

+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
+                ggml_tensor * input,
+                        int   il) {
+    const int64_t d_inner      = hparams.ssm_d_inner;
+    const int64_t n_seqs       = ubatch.n_seqs;
+    const int64_t head_k_dim   = hparams.ssm_d_state;
+    const int64_t num_k_heads  = hparams.ssm_n_group;
+    const int64_t num_v_heads  = hparams.ssm_dt_rank;
+    const int64_t head_v_dim   = d_inner / num_v_heads;
+    const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+    if (model.layers[il].wqkv) {
+        // optimized path
+        ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+        qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
+        cb(qkv_mixed, "linear_attn_qkv_mixed", il);
+
+        ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+        cb(z, "z", il);
+
+        return { qkv_mixed, z };
+
+    } else {
+        // legacy (slower) path
+        ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
+        cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
+
+        int64_t       qkvz_new_dim        = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
+        ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
+
+        // Split mixed_qkvz into query, key, value, z
+        int64_t split_sizes_qkvz[4] = {
+            head_k_dim,                              // query size
+            head_k_dim,                              // key size
+            head_v_dim * num_v_heads / num_k_heads,  // value size
+            head_v_dim * num_v_heads / num_k_heads   // z size
+        };
+
+        ggml_tensor * query =
+            ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
+                        mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
+        cb(query, "q", il);
+
+        ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
+                                        mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+                                        split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped));
+        cb(key, "k", il);
+
+        ggml_tensor * value =
+            ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
+                        mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+                        (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped));
+        cb(value, "v", il);
+
+        ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
+                                    mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+                                    (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped));
+        cb(z, "z", il);
+
+        // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
+        // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+        ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
+        cb(query_flat, "query_flat", il);
+
+        // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+        ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
+        cb(key_flat, "key_flat", il);
+
+        // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
+        ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
+        cb(value_flat, "value_flat", il);
+
+        // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
+        ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
+        qkv_mixed               = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
+        cb(qkv_mixed, "qkv_mixed", il);
+
+        return { qkv_mixed, z };
+    }
+}
+
 ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
        llm_graph_input_rs * inp,
        ggml_tensor *        cur,
@ -547,15 +638,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

    // Input projections
-    ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, cur);
-    cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
+    auto qkvz = build_qkvz(cur, il);
+    ggml_tensor * qkv_mixed = qkvz.first;
+    ggml_tensor * z         = qkvz.second;

    ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
    cb(mixed_ba, "linear_attn_mixed_ba", il);

-    int64_t       qkvz_new_dim        = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
-    ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
-
    // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
    int64_t       ba_new_dim        = 2 * num_v_heads / num_k_heads;
    ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
@ -575,8 +664,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
                                   split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
    cb(a, "a", il);

-    // Reshape b and a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
-    ggml_tensor * beta  = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
+    ggml_tensor * beta  = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
+
+    // Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
    ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);

    ggml_tensor * alpha_biased   = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
@ -585,48 +675,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a);  // -A_log.exp() * softplus
    cb(gate, "gate", il);

-    // Split mixed_qkvz into query, key, value, z
-    int64_t split_sizes_qkvz[4] = {
-        head_k_dim,                              // query size
-        head_k_dim,                              // key size
-        head_v_dim * num_v_heads / num_k_heads,  // value size
-        head_v_dim * num_v_heads / num_k_heads   // z size
-    };
-
-    ggml_tensor * query =
-        ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
-                     mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
-    cb(query, "q", il);
-
-    ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
-                                     mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
-                                     split_sizes_qkvz[0] * sizeof(float));
-    cb(key, "k", il);
-
-    ggml_tensor * value =
-        ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
-                     mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
-                     (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * sizeof(float));
-    cb(value, "v", il);
-
-    ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
-                                   mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
-                                   (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
-    cb(z, "z", il);
-
-    // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
-    // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
-    ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
-    cb(query_flat, "query_flat", il);
-
-    // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
-    ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
-    cb(key_flat, "key_flat", il);
-
-    // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
-    ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
-    cb(value_flat, "value_flat", il);
-
    // Get convolution states from cache
    ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
    ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);
@ -637,17 +685,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
    cb(conv_states, "conv_states", il);

-    // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
-    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
-    qkv_mixed               = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
-    cb(qkv_mixed, "qkv_mixed", il);
-
-    qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
-    cb(qkv_mixed, "qkv_mixed_permuted", il);
-
-    // Calculate the total conv dimension
-    int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
-
    // Calculate convolution kernel size
    ggml_tensor * conv_kernel      = model.layers[il].ssm_conv1d;
    const int64_t conv_kernel_size = conv_kernel->ne[0];
@ -655,6 +692,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    conv_states                    = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
    cb(conv_states, "conv_states_reshaped", il);

+    qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
+    cb(qkv_mixed, "qkv_mixed_permuted", il);
+
    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
    cb(conv_input, "conv_input", il);

@ -677,26 +717,25 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
    cb(conv_output_proper, "conv_output_raw", il);

-    conv_output_proper = ggml_cont(ctx0, ggml_transpose(ctx0, conv_output_proper));
-    cb(conv_output_proper, "conv_output_pre_silu", il);
-
    ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
    cb(conv_output_silu, "conv_output_silu", il);

-    ggml_tensor * conv_qkv_mix =
-        ggml_cont_2d(ctx0, ggml_transpose(ctx0, conv_output_silu), qkv_dim, n_seq_tokens * n_seqs);
-    cb(conv_qkv_mix, "conv_qkv_mix", il);
+    ggml_tensor * conv_qkv_mix = conv_output_silu;
+
+    // Calculate the total conv dimension
+    int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
+    int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);

    // Extract the convolved Q, K, V from conv_output
    ggml_tensor * q_conv =
-        ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1], 0);
+        ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
    cb(q_conv, "q_conv", il);
    ggml_tensor * k_conv =
-        ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
+        ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
                     head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
    cb(k_conv, "k_conv", il);
    ggml_tensor * v_conv =
-        ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
+        ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
                     2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
    cb(v_conv, "v_conv", il);

@ -705,8 +744,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
    v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);

-    beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
-
    ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
    state               = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
    cb(state, "state_predelta", il);
@ -738,45 +775,29 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
    cb(v_conv, "v_conv_predelta", il);

    // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
-    ggml_tensor * attn_out;
+    std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
    if (n_seq_tokens == 1) {
        attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
    } else {
        attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
    }
-    cb(attn_out, "attn_out", il);
-
-    // The tensors were concatenated 1d, so we need to extract them 1d as well
-    const int64_t output_flat_size = head_v_dim * num_v_heads * n_seq_tokens * n_seqs;
-    ggml_tensor * attn_out_1d      = ggml_view_1d(ctx0, attn_out, output_flat_size, 0);
-    cb(attn_out_1d, "attn_out_1d", il);
-
-    ggml_tensor * attn_out_final = ggml_cont_4d(ctx0, attn_out_1d, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
-    cb(attn_out_final, "attn_out_reshaped", il);
-
-    // Extract the state part (second part of the concatenated tensor)
-    // State starts after n_tokens elements along dimension 1
-    const int64_t state_flat_size = head_v_dim * head_v_dim * num_v_heads * n_seqs;
-
-    ggml_tensor * state_1d =
-        ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out));
-    cb(state_1d, "state_1d", il);
+    ggml_tensor * output    = attn_out.first;
+    ggml_tensor * new_state = attn_out.second;
+    cb(output, "attn_output", il);
+    cb(new_state, "new_state", il);

    // Update the recurrent states
    ggml_build_forward_expand(gf,
-                              ggml_cpy(ctx0, state_1d,
+                              ggml_cpy(ctx0, new_state,
                                       ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
                                                    kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));

-    GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out));
-
    // Reshape both attn_out_final and z to 2D tensors for normalization
    // attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
-    ggml_tensor * attn_out_2d_final =
-        ggml_cont_2d(ctx0, attn_out_final, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+    ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);

    // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
-    ggml_tensor * z_2d = ggml_cont_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+    ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);

    // Apply gated normalization: self.norm(core_attn_out, z)
    ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
@ -828,12 +849,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
            shared_gate = ggml_sigmoid(ctx0, shared_gate);
            cb(shared_gate, "shared_expert_gate_sigmoid", il);

-            // The gate needs to be broadcast to match the dimensions of ffn_shexp
-            // ffn_shexp is [n_embd, n_tokens, 1, 1] and shared_gate is [1, n_tokens, 1, 1]
-            // We need to repeat the gate along the feature dimension
-            shared_gate = ggml_repeat(ctx0, shared_gate, ffn_shexp);
-            cb(shared_gate, "shared_expert_gate_broadcast", il);
-
            // Apply the gate to the shared expert output
            ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
            cb(ffn_shexp, "ffn_shexp_gated", il);
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "download.h"

 #include <string>
 #include <vector>
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -454,6 +454,28 @@ static bool ggml_is_view_op(enum ggml_op op) {
    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
 }

+static bool backend_has_feature(ggml_backend_t backend, const char * feature_name) {
+    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+
+    auto get_features = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+    if (!get_features) {
+        return false;
+    }
+
+    const ggml_backend_feature * features = get_features(reg);
+    if (!features) {
+        return false;
+    }
+
+    for (const ggml_backend_feature * f = features; f->name; ++f) {
+        if (strcmp(f->name, feature_name) == 0 && strcmp(f->value, "1") == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
 enum test_mode {
    MODE_TEST,
    MODE_PERF,
@ -1101,6 +1123,11 @@ struct test_case {
        return 1e-7;
    }

+    virtual double max_nmse_err(ggml_backend_t backend) {
+        GGML_UNUSED(backend);
+        return max_nmse_err();
+    }
+
    virtual double max_maa_err() {
        return 1e-4;
    }
@ -1109,6 +1136,10 @@ struct test_case {
        return max_nmse_err();
    }

+    virtual double max_err(ggml_backend_t backend) {
+        return max_nmse_err(backend);
+    }
+
    virtual double err(const float * a, const float * b, size_t n) {
        return nmse(a, b, n);
    }
@ -1378,8 +1409,8 @@ struct test_case {
            }

            double err = ud->tc->err(f1.data(), f2.data(), f1.size());
-            if (err > ud->tc->max_err()) {
-                printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err());
+            if (err > ud->tc->max_err(ud->backend1)) {
+                printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err(ud->backend1));
                //for (int i = 0; i < (int) f1.size(); i++) {
                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
                //}
@ -3686,6 +3717,14 @@ struct test_mul_mat : public test_case {
        return 5e-4;
    }

+    double max_nmse_err(ggml_backend_t backend) override {
+        // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
+        if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
+            return 2e-2;
+        }
+        return max_nmse_err();
+    }
+
    int64_t grad_nmax() override {
        return 20000;
    }
@ -3814,6 +3853,14 @@ struct test_mul_mat_id : public test_case {
        return 5e-4;
    }

+    double max_nmse_err(ggml_backend_t backend) override {
+        // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
+        if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
+            return 2e-2;
+        }
+        return max_nmse_err();
+    }
+
    uint64_t op_flops(ggml_tensor * t) override {
        GGML_UNUSED(t);
        return 2 * m * k * n * n_used;
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@ -18,14 +18,13 @@ else()
    add_subdirectory(gguf-split)
    add_subdirectory(imatrix)
    add_subdirectory(llama-bench)
-    add_subdirectory(cli)
    add_subdirectory(completion)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    if (LLAMA_BUILD_SERVER)
+        add_subdirectory(cli)
        add_subdirectory(server)
    endif()
-    add_subdirectory(run)
    add_subdirectory(tokenize)
    add_subdirectory(tts)
    add_subdirectory(mtmd)
--- a/tools/fit-params/fit-params.cpp
+++ b/tools/fit-params/fit-params.cpp
@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);
    const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
        params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
        LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@ -27,6 +27,7 @@ add_library(mtmd
            models/qwen3vl.cpp
            models/siglip.cpp
            models/whisper-enc.cpp
+            models/mobilenetv5.cpp
            models/youtuvl.cpp
            )

--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@ -154,6 +154,47 @@
 #define TN_CONV_PW1        "%s.blk.%d.conv_pw1.%s"
 #define TN_CONV_PW2        "%s.blk.%d.conv_pw2.%s"

+// mobilenetv5 (gemma3n) definitions
+#define TN_MNV5_STEM_CONV        "v.conv_stem.conv.weight"
+#define TN_MNV5_STEM_BIAS        "v.conv_stem.conv.bias"
+#define TN_MNV5_STEM_BN          "v.conv_stem.bn.weight"
+
+// Stage 0 Block (Edge Residual)
+#define TN_MNV5_BLK_S0_EXP_W     "v.blk.%d.%d.conv_exp.weight"
+#define TN_MNV5_BLK_S0_BN1_W     "v.blk.%d.%d.bn1.weight"
+#define TN_MNV5_BLK_S0_PWL_W     "v.blk.%d.%d.conv_pwl.weight"
+#define TN_MNV5_BLK_S0_BN2_W     "v.blk.%d.%d.bn2.weight"
+
+// Stage 1+ Block (Universal Inverted Residual)
+#define TN_MNV5_BLK_DW_START_W   "v.blk.%d.%d.dw_start.conv.weight"
+#define TN_MNV5_BLK_DW_START_BN  "v.blk.%d.%d.dw_start.bn.weight"
+#define TN_MNV5_BLK_DW_MID_W     "v.blk.%d.%d.dw_mid.conv.weight"
+#define TN_MNV5_BLK_DW_MID_BN    "v.blk.%d.%d.dw_mid.bn.weight"
+#define TN_MNV5_BLK_PW_EXP_W     "v.blk.%d.%d.pw_exp.conv.weight"
+#define TN_MNV5_BLK_PW_EXP_BN    "v.blk.%d.%d.pw_exp.bn.weight"
+#define TN_MNV5_BLK_PW_PROJ_W    "v.blk.%d.%d.pw_proj.conv.weight"
+#define TN_MNV5_BLK_PW_PROJ_BN   "v.blk.%d.%d.pw_proj.bn.weight"
+#define TN_MNV5_BLK_LAYER_SCALE  "v.blk.%d.%d.layer_scale.gamma"
+
+// Attention Components
+#define TN_MNV5_ATTN_Q_W         "v.blk.%d.%d.attn.query.proj.weight"
+#define TN_MNV5_ATTN_K_W         "v.blk.%d.%d.attn.key.proj.weight"
+#define TN_MNV5_ATTN_V_W         "v.blk.%d.%d.attn.value.proj.weight"
+#define TN_MNV5_ATTN_O_W         "v.blk.%d.%d.attn.output.proj.weight"
+#define TN_MNV5_ATTN_K_DW        "v.blk.%d.%d.attn.key.down_conv.weight"
+#define TN_MNV5_ATTN_K_NORM      "v.blk.%d.%d.attn.key.norm.weight"
+#define TN_MNV5_ATTN_V_DW        "v.blk.%d.%d.attn.value.down_conv.weight"
+#define TN_MNV5_ATTN_V_NORM      "v.blk.%d.%d.attn.value.norm.weight"
+#define TN_MNV5_ATTN_NORM        "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
+
+// MSFA
+#define TN_MNV5_MSFA_FFN_EXP_W   "v.msfa.ffn.pw_exp.conv.weight"
+#define TN_MNV5_MSFA_FFN_EXP_BN  "v.msfa.ffn.pw_exp.bn.weight"
+#define TN_MNV5_MSFA_FFN_PROJ_W  "v.msfa.ffn.pw_proj.conv.weight"
+#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
+#define TN_MNV5_MSFA_NORM        "v.msfa.norm.weight"
+
+
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

@ -171,6 +212,8 @@ enum projector_type {
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_QWEN3VL,
    PROJECTOR_TYPE_GEMMA3,
+    PROJECTOR_TYPE_GEMMA3NV,
+    PROJECTOR_TYPE_GEMMA3NA,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
@ -203,6 +246,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
    { PROJECTOR_TYPE_QWEN3VL,   "qwen3vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
+    { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
+    { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@ -173,6 +173,45 @@ struct clip_layer {
    }
 };

+// Expanded MobileNetV5 block structure for Gemma3n vision encoder
+struct mobilenetv5_block {
+    // Stage 0 (Edge Residual)
+    ggml_tensor * s0_conv_exp_w = nullptr;
+    ggml_tensor * s0_bn1_w      = nullptr;
+    ggml_tensor * s0_conv_pwl_w = nullptr;
+    ggml_tensor * s0_bn2_w      = nullptr;
+
+    // Stage 1+ (Universal Inverted Residual)
+    ggml_tensor * dw_start_w    = nullptr;
+    ggml_tensor * dw_start_bn_w = nullptr;
+
+    ggml_tensor * pw_exp_w      = nullptr;
+    ggml_tensor * pw_exp_bn_w   = nullptr;
+
+    ggml_tensor * dw_mid_w      = nullptr;
+    ggml_tensor * dw_mid_bn_w   = nullptr;
+
+    ggml_tensor * pw_proj_w     = nullptr;
+    ggml_tensor * pw_proj_bn_w  = nullptr;
+
+    ggml_tensor * layer_scale_w = nullptr;
+
+    // Attention (MQA) components
+    ggml_tensor * attn_q_w = nullptr;
+    ggml_tensor * attn_k_w = nullptr;
+    ggml_tensor * attn_v_w = nullptr;
+    ggml_tensor * attn_o_w = nullptr;
+
+    // Optional downsampling/norm in attention
+    ggml_tensor * attn_k_dw_w   = nullptr;
+    ggml_tensor * attn_k_norm_w = nullptr;
+    ggml_tensor * attn_v_dw_w   = nullptr;
+    ggml_tensor * attn_v_norm_w = nullptr;
+
+    // Block norm (often present in attention blocks)
+    ggml_tensor * attn_norm_w   = nullptr;
+};
+
 struct clip_model {
    clip_modality modality = CLIP_MODALITY_VISION;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
@ -289,6 +328,23 @@ struct clip_model {
    ggml_tensor * mm_input_proj_w = nullptr;
    ggml_tensor * mm_soft_emb_norm_w = nullptr;

+    // mobilenetv5 for gemma3n
+    std::vector<mobilenetv5_block> mobilenet_blocks;
+    std::vector<int> mobilenet_stage_ends;
+    ggml_tensor * mobilenet_stem_conv_w = nullptr;
+    ggml_tensor * mobilenet_stem_conv_b = nullptr;
+    ggml_tensor * mobilenet_stem_norm_w = nullptr;
+    ggml_tensor * mm_post_proj_norm_w = nullptr;
+
+    // Multi-Scale Fusion Adapter (MSFA) components
+    ggml_tensor * msfa_concat_conv_w = nullptr;
+    ggml_tensor * msfa_concat_norm_w = nullptr;
+    ggml_tensor * msfa_ffn_expand_w = nullptr;
+    ggml_tensor * msfa_ffn_project_w = nullptr;
+    ggml_tensor * msfa_ffn_expand_bn = nullptr;
+    ggml_tensor * msfa_ffn_project_bn = nullptr;
+
+
    // pixtral, glm4v
    ggml_tensor * token_embd_img_break = nullptr;
    ggml_tensor * mm_patch_merger_w = nullptr;
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -788,6 +788,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_siglip>(ctx, img);
            } break;
+        case PROJECTOR_TYPE_GEMMA3NV:
+            {
+                builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
+            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
@ -1146,6 +1150,14 @@ struct clip_model_loader {
                        // test model (tinygemma3) has a different value, we optionally read it
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;
+
+                case PROJECTOR_TYPE_GEMMA3NV:
+                    {
+                        // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
+                        // Similar configuration to Gemma3
+                        hparams.n_merge = 1;  // MobileNetV5 handles resizing internally
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+                    } break;
                case PROJECTOR_TYPE_QWEN2VL:
                case PROJECTOR_TYPE_QWEN25VL:
                case PROJECTOR_TYPE_QWEN3VL:
@ -1334,6 +1346,10 @@ struct clip_model_loader {

        model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);

+        if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
+            hparams.n_layer = 0; // gemma3n does not use normal layer structure
+        }
+
        // layers
        model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@ -1408,6 +1424,7 @@ struct clip_model_loader {
            }
        }

+
        switch (model.proj_type) {
            case PROJECTOR_TYPE_MLP:
            case PROJECTOR_TYPE_MLP_NORM:
@ -1547,6 +1564,99 @@ struct clip_model_loader {
                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
                } break;
+            case PROJECTOR_TYPE_GEMMA3NV:
+                {
+                    model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
+                    model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
+                    model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
+
+                    model.msfa_ffn_expand_w  = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
+                    model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
+                    model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
+                    model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
+
+                    model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
+
+                    // Dynamically load blocks stage by stage
+                    for (int stage = 0; stage < 4; ++stage) {
+                        int blocks_found_in_stage = 0;
+
+                        for (int blk_idx = 0; ; ++blk_idx) {
+                            bool found_block = false;
+                            mobilenetv5_block block;
+
+                            // 1. Check for Edge Residual (S0)
+                            block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
+                            if (block.s0_conv_exp_w) {
+                                found_block = true;
+                                block.s0_bn1_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
+                                block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
+                                block.s0_bn2_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
+                            }
+                            // 2. Check for UIR (Universal Inverted Residual)
+                            else {
+                                // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
+                                block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
+                                block.pw_exp_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
+
+                                if (block.dw_start_w || block.pw_exp_w) {
+                                    found_block = true;
+                                    if (block.dw_start_w) {
+                                        block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
+                                    }
+                                    if (block.pw_exp_w) {
+                                        block.pw_exp_bn_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
+                                    }
+                                    block.dw_mid_w      = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
+                                    if (block.dw_mid_w) {
+                                        block.dw_mid_bn_w   = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
+                                    }
+                                    block.pw_proj_w     = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
+                                    if (block.pw_proj_w) {
+                                        block.pw_proj_bn_w  = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
+                                    }
+                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
+                                }
+                            }
+
+                            // 3. Check for Attention (MQA)
+                            // Even if UIR/Edge check failed, this might be a pure attention block
+                            ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
+                            if (attn_q_check) {
+                                found_block = true;
+                                block.attn_q_w = attn_q_check;
+                                block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
+                                block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
+                                block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
+                                block.attn_k_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
+                                block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
+                                block.attn_v_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
+                                block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
+                                block.attn_norm_w   = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
+                                // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
+                                if (!block.layer_scale_w) {
+                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
+                                }
+                            }
+
+                            if (found_block) {
+                                model.mobilenet_blocks.push_back(block);
+                                blocks_found_in_stage++;
+                            } else {
+                                // End of blocks for this stage
+                                break;
+                            }
+                        }
+
+                        // Track where this stage ends in the flat vector
+                        if (blocks_found_in_stage > 0) {
+                            model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
+                            LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
+                        }
+                    }
+                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
+                } break;
            case PROJECTOR_TYPE_IDEFICS3:
                {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
@ -2002,6 +2112,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

    try {
        clip_model_loader loader(fname);
+        bool skip_audio = false;

        if (loader.has_vision) {
            ctx_vision = new clip_ctx(ctx_params);
@ -2011,10 +2122,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
                loader.warmup(*ctx_vision);
            }

+            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
+            // we can remove this check when we implement audio support for Gemma 3N
+            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
+
            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

-        if (loader.has_audio) {
+        if (loader.has_audio && !skip_audio) {
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
@ -2852,6 +2967,16 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

+        case PROJECTOR_TYPE_GEMMA3NV:
+            {
+                clip_image_u8 resized_image;
+                int sz = params.image_size;
+                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
+                clip_image_f32_ptr img_f32(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+                res_imgs->entries.push_back(std::move(img_f32));
+            } break;
+
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@ -3114,6 +3239,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                int scale_factor = ctx->model.hparams.n_merge;
                n_patches /= (scale_factor * scale_factor);
            } break;
+        case PROJECTOR_TYPE_GEMMA3NV:
+            {
+                // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
+                // regardless of input size (see architecture description)
+                n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
+            } break;
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            {
@ -3506,6 +3637,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
@ -3633,6 +3765,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.projection->ne[1];
@ -3663,6 +3796,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
 }

 int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    // TODO: remove this function
    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
        return ctx->model.hparams.minicpmv_version;
    }
@ -3670,24 +3804,26 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 }

 bool clip_is_glm(const struct clip_ctx * ctx) {
+    // TODO: remove this function
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }

 bool clip_is_mrope(const struct clip_ctx * ctx) {
-    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
-        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
-        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
-        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
+    switch (ctx->proj_type()) {
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
+            return true;
+        default:
+            return false;
+    }
 }

 bool clip_is_llava(const struct clip_ctx * ctx) {
    return ctx->model.hparams.has_llava_projector;
 }

-bool clip_is_gemma3(const struct clip_ctx * ctx) {
-    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
-}
-
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
 }
@ -3697,11 +3833,16 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
 }

 bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
-        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
-        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
-        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
-        || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+    switch (ctx->proj_type()) {
+        case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+            return true;
+        default:
+            return false;
+    }
 }

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@ -106,7 +106,8 @@ int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_mrope(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
-bool clip_is_gemma3(const struct clip_ctx * ctx);
+// note for contributor: this clip_is_(model) pattern is deprecated
+//                       do NOT add new functions like this

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

--- a/tools/mtmd/models/mobilenetv5.cpp
+++ b/tools/mtmd/models/mobilenetv5.cpp
@ -0,0 +1,451 @@
+#include "models.h"
+
+// Helpers for MobileNetV5 Blocks
+// RMS Norm 2D - normalizes over channels for each spatial position
+ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
+    // inp: [W, H, C, B]
+
+    ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
+    cur = ggml_cont(ctx0, cur);
+    cur = ggml_rms_norm(ctx0, cur, eps);
+
+    if (weight) {
+        cur = ggml_mul(ctx0, cur, weight);
+    }
+
+    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
+    cur = ggml_cont(ctx0, cur);
+
+    return cur;
+}
+
+// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
+ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
+    const int64_t ih = inp->ne[1];  // height
+    const int64_t iw = inp->ne[0];  // width
+
+    // Calculate output size (ceil division)
+    const int64_t oh = (ih + stride_h - 1) / stride_h;
+    const int64_t ow = (iw + stride_w - 1) / stride_w;
+
+    // Calculate padding needed
+    const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
+    const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
+
+    // Split padding asymmetrically
+    const int pad_h_top = pad_h / 2;
+    const int pad_h_bottom = pad_h - pad_h_top;
+    const int pad_w_left = pad_w / 2;
+    const int pad_w_right = pad_w - pad_w_left;
+
+    // Apply padding if needed
+    // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
+    // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
+    if (pad_h > 0 || pad_w > 0) {
+        inp = ggml_pad_ext(ctx0, inp,
+            pad_w_left, pad_w_right,     // width padding (dim 0)
+            pad_h_top, pad_h_bottom,      // height padding (dim 1)
+            0, 0,                         // no channel padding (dim 2)
+            0, 0);                        // no batch padding (dim 3)
+    }
+
+    return inp;
+}
+
+
+// Edge Residual Block (Stage 0)
+ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+    ggml_tensor * cur = inp;
+
+    // 1. Expansion Conv (3x3)
+    if (stride == 2) {
+        // Case: Downsampling (Block 0)
+        // Replicates Conv2dSame(kernel=3, stride=2)
+        cur = pad_same_2d(cur, 3, 3, stride, stride);
+        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
+    } else {
+        // Case: Normal 3x3 Block (Block 1, 2)
+        // Replicates Conv2d(kernel=3, stride=1, padding=1)
+        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
+    }
+
+    // BN + Activation
+    if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
+    cur = ggml_gelu(ctx0, cur);
+
+    // 2. Pointwise Linear Conv (1x1)
+    // 1x1 Convs usually have padding=0 and stride=1
+    cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
+    if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
+
+    // 3. Residual Connection
+    // Only apply residual if spatial dimensions and channels match (stride 1)
+    if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
+        cur = ggml_add(ctx0, cur, inp);
+    }
+
+    return cur;
+}
+
+// Universal Inverted Residual Block (Stage 1+)
+ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+    ggml_tensor * cur = inp;
+
+    // 1. Depthwise Start (Optional)
+    // NOTE: dw_start always has stride=1 (no downsampling here)
+    if (block.dw_start_w) {
+        int k = block.dw_start_w->ne[0]; // 3 or 5
+        int p = k / 2;
+        cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
+        if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
+    }
+
+    // 2. Pointwise Expansion (1x1)
+    if (block.pw_exp_w) {
+        // Standard 1x1 conv, pad=0, stride=1
+        cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
+        if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
+        cur = ggml_gelu(ctx0, cur);
+    }
+
+    // 3. Depthwise Mid (Optional)
+    // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
+    if (block.dw_mid_w) {
+        int k = block.dw_mid_w->ne[0]; // 3 or 5
+
+        if (stride > 1) {
+            // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
+            cur = pad_same_2d(cur, k, k, stride, stride);
+            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
+        } else {
+            // Case: Stride 1 -> Use Standard Symmetric Padding
+            int p = k / 2;
+            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
+        }
+
+        if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
+        cur = ggml_gelu(ctx0, cur);
+    }
+
+    // 4. Pointwise Projection (1x1)
+    if (block.pw_proj_w) {
+        cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
+        if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
+    }
+
+    // Apply Layer Scaling if present
+    if (block.layer_scale_w) {
+        cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+    }
+
+    // 5. Residual Connection
+    bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
+    bool same_channel = (inp->ne[2] == cur->ne[2]);
+    if (same_spatial && same_channel) {
+        cur = ggml_add(ctx0, cur, inp);
+    }
+
+    return cur;
+}
+
+// Attention Block (MQA)
+ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
+    ggml_tensor * cur = inp;
+
+    // Norm
+    if (block.attn_norm_w) {
+        cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
+    }
+
+    // 1. Q Calculation
+    ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
+
+    // 2. K Calculation (Downsampled)
+    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+    ggml_tensor * k_inp = cur;
+    if (block.attn_k_dw_w) {
+        int k_size = block.attn_k_dw_w->ne[0];  // Usually 3
+        k_inp = pad_same_2d(cur, k_size, k_size, 2, 2);  // Apply SAME padding
+        k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1);  // padding=0
+        if (block.attn_k_norm_w) {
+            k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
+        }
+    }
+    ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
+
+    // 3. V Calculation (Downsampled)
+    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+    ggml_tensor * v_inp = cur;
+    if (block.attn_v_dw_w) {
+        int v_size = block.attn_v_dw_w->ne[0];  // Usually 3
+        v_inp = pad_same_2d(cur, v_size, v_size, 2, 2);  // Apply SAME padding
+        v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1);  // padding=0
+        if (block.attn_v_norm_w) {
+            v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
+        }
+    }
+    ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
+
+    const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
+    const int D = k->ne[2]; // Head dimension
+    const int n_head = q->ne[2] / D;
+    const int N = W * H;
+
+    // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
+    q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
+    q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
+    q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
+    q = ggml_cont(ctx0, q);
+
+    const int Wk = k->ne[0]; const int Hk = k->ne[1];
+    const int M = Wk * Hk;
+
+    // Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
+    k = ggml_reshape_3d(ctx0, k, M, D, B);
+    k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
+    k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
+    k = ggml_cont(ctx0, k);
+
+    // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
+    v = ggml_reshape_3d(ctx0, v, M, D, B);
+    v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
+    v = ggml_cont(ctx0, v); // [M, D, 1, B]
+
+    // Multi-Query Attention
+    float scale = 1.0f / sqrtf((float)D);
+
+    // Step 1: Compute Q @ K.T
+    ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
+
+    scores = ggml_scale(ctx0, scores, scale);
+
+    scores = ggml_soft_max(ctx0, scores);
+
+    ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
+
+    kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
+    kqv = ggml_cont(ctx0, kqv);
+
+
+    kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
+    kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
+    kqv = ggml_cont(ctx0, kqv);
+
+    // Output projection
+    cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
+
+    // Residual & Layer Scale
+    if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
+        if (block.layer_scale_w) {
+            cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+        }
+        cur = ggml_add(ctx0, cur, inp);
+    }
+
+    return cur;
+}
+
+ggml_cgraph * clip_graph_mobilenetv5::build() {
+    ggml_tensor * inp = build_inp_raw();
+
+    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
+    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2);  // Apply SAME padding
+
+    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1);  // padding=0
+    if (model.mobilenet_stem_conv_b) {
+        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
+    }
+    if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
+    cur = ggml_gelu(ctx0, cur);
+
+
+    // 2. Blocks
+    std::vector<ggml_tensor*> intermediate_features;
+    const int total_blocks = model.mobilenet_blocks.size();
+
+    auto is_stage_start = [&](int i) {
+        if (i == 0) return true;
+        for (int end_idx : model.mobilenet_stage_ends) {
+            if (i == end_idx + 1) return true;
+        }
+        return false;
+    };
+
+    auto is_fusion_point = [&](int i) {
+        if (model.mobilenet_stage_ends.size() >= 4) {
+                if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
+                if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
+        } else {
+            if (i == total_blocks - 1) return true;
+        }
+        return false;
+    };
+
+    for (int i = 0; i < total_blocks; i++) {
+        const auto & block = model.mobilenet_blocks[i];
+        int stride = is_stage_start(i) ? 2 : 1;
+
+        if (block.s0_conv_exp_w)      cur = build_edge_residual(cur, block, stride);
+        else if (block.attn_q_w)      cur = build_mobilenet_attn(cur, block);
+        else                          cur = build_inverted_residual(cur, block, stride);
+
+        if (is_fusion_point(i)) {
+
+            intermediate_features.push_back(cur);
+        }
+    }
+
+    // 3. Multi-Scale Fusion Adapter (MSFA)
+    if (!intermediate_features.empty()) {
+
+        // A. Reference Resolution: PyTorch implementation uses inputs[0]
+        // We assume intermediate_features[0] is the "High Resolution" target.
+        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
+        ggml_tensor* target_feat = intermediate_features[0];
+        int high_res_w = target_feat->ne[0];
+        int high_res_h = target_feat->ne[1];
+
+        std::vector<ggml_tensor*> resized_feats;
+
+        // B. Resize inputs to match inputs[0] (High Resolution)
+        for (auto feat : intermediate_features) {
+            int feat_w = feat->ne[0];
+            int feat_h = feat->ne[1];
+
+            // PyTorch: if feat_size < high_resolution: interpolate
+            if (feat_w < high_res_w || feat_h < high_res_h) {
+                // Calculate scale factor.
+                // Note: PyTorch 'nearest' works on arbitrary float scales.
+                // ggml_upscale generally takes integer factors or target sizes depending on helper.
+                // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
+                int scale_w = high_res_w / feat_w;
+                // int scale_h = high_res_h / feat_h;
+
+                // Safety check for non-integer scaling if strictly replicating
+                GGML_ASSERT(high_res_w % feat_w == 0);
+
+                // Upsample (Nearest Neighbor)
+                // 2 is the scale factor
+                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
+            }
+            resized_feats.push_back(feat);
+        }
+
+        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
+        cur = resized_feats[0];
+        for (size_t k = 1; k < resized_feats.size(); ++k) {
+            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
+        }
+
+        // D. FFN (UniversalInvertedResidual)
+        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
+
+        // 1. Expansion
+        if (model.msfa_ffn_expand_w) {
+            // 1x1 Conv
+            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
+
+            if (model.msfa_ffn_expand_bn) {
+                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
+            }
+
+            cur = ggml_gelu(ctx0, cur);
+
+        }
+
+        // 2. Projection (No DW because kernel_size=0)
+        if (model.msfa_ffn_project_w) {
+            // 1x1 Conv
+            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
+
+            // UniversalInvertedResidual typically has a norm after projection
+            if (model.msfa_ffn_project_bn) {
+                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
+            }
+
+        }
+
+        // E. Final Downsample to Target Resolution (Output Resolution)
+        // PyTorch: matches self.output_resolution (e.g. 16x16)
+        const int target_out_res = 16;
+        int current_w = cur->ne[0];
+
+        if (current_w > target_out_res) {
+            int s = current_w / target_out_res;
+
+            GGML_ASSERT(current_w % target_out_res == 0);
+
+            // Avg Pool: Kernel=s, Stride=s
+            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
+
+        }
+
+        // F. Final Norm
+        if (model.msfa_concat_norm_w) {
+            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
+
+        }
+    }
+
+    // 4. Gemma 3n Multimodal Projection (Embedder)
+    // Input: 'cur' is [Width, Height, Channels, Batch]
+    int W = cur->ne[0];
+    int H = cur->ne[1];
+    int C = cur->ne[2];
+    int B = cur->ne[3];
+
+    GGML_ASSERT(C == hparams.n_embd);
+
+    // 1. Permute and Flatten to [Channels, Tokens, Batch]
+    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
+    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
+    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
+    cur = ggml_cont(ctx0, cur);
+    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
+    cur = ggml_cont(ctx0, cur);
+
+
+    // 2. FEATURE SCALING
+    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
+    const float scale_factor = sqrtf((float)C);
+    cur = ggml_scale(ctx0, cur, scale_factor);
+
+
+    // 3. SOFT EMBEDDING NORM
+    // PyTorch: self._norm(x) * self.weight
+    // We must normalize regardless, then multiply if weight exists.
+    {
+        const float eps = 1e-6f; // Gemma3n uses 1e-6
+        cur = ggml_rms_norm(ctx0, cur, eps);
+
+        if (model.mm_soft_emb_norm_w) {
+            // Weight shape is (2048,) -> Element-wise broadcast multiply
+            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+        }
+
+    }
+
+    // 4. PROJECTION
+    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
+    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
+    if (model.mm_input_proj_w) {
+        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
+    }
+
+    // 5. POST PROJECTION NORM
+    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
+    // with_scale=False means weight is registered as buffer with value 1.0
+    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
+    {
+        const float eps = 1e-6f;
+        cur = ggml_rms_norm(ctx0, cur, eps);
+
+        if (model.mm_post_proj_norm_w) {
+            // If weight is loaded, multiply (should be ~1.0 anyway)
+            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
+        }
+    }
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@ -76,3 +76,36 @@ struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
 };
+
+struct clip_graph_mobilenetv5 : clip_graph {
+    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+
+    ggml_tensor * rms_norm_2d(
+        ggml_tensor * inp,
+        ggml_tensor * weight,
+        float eps = 1e-6f);
+
+    ggml_tensor* pad_same_2d(
+        ggml_tensor* inp,
+        int kernel_h,
+        int kernel_w,
+        int stride_h,
+        int stride_w,
+        int dilation_h = 1,
+        int dilation_w = 1);
+
+    ggml_tensor * build_edge_residual(
+        ggml_tensor * inp,
+        const mobilenetv5_block & block,
+        int stride);
+
+    ggml_tensor * build_inverted_residual(
+        ggml_tensor * inp,
+        const mobilenetv5_block & block,
+        int stride);
+
+    ggml_tensor * build_mobilenet_attn(
+        ggml_tensor * inp,
+        const mobilenetv5_block & block);
+};
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@ -266,7 +266,7 @@ struct mtmd_context {
        }

        // set boi/eoi
-        if (proj == PROJECTOR_TYPE_GEMMA3) {
+        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
            // <start_of_image> ... (image embeddings) ... <end_of_image>
            img_beg = "<start_of_image>";
            img_end = "<end_of_image>";
@ -862,10 +862,15 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 }

 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
-        return true;
+    switch (ctx->proj_type_v()) {
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_YOUTUVL:
+            return true;
+        default:
+            return false;
    }
-    return false;
 }

 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@ -1,23 +0,0 @@
-set(TARGET llama-run)
-add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
-
-# TODO: avoid copying this code block from common/CMakeLists.txt
-set(LLAMA_RUN_EXTRA_LIBS "")
-if (LLAMA_CURL)
-    find_package(CURL REQUIRED)
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
-    include_directories(${CURL_INCLUDE_DIRS})
-    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES})
-endif ()
-
-if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
-endif()
-
-if (CMAKE_SYSTEM_NAME MATCHES "AIX")
-    # AIX's flock() function comes from libbsd.a
-    target_link_libraries(${TARGET} PRIVATE -lbsd)
-endif()
-
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/tools/run/README.md
+++ b/tools/run/README.md
@ -1,52 +0,0 @@
-# llama.cpp/example/run
-
-The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
-
-```bash
-llama-run granite3-moe
-```
-
-```bash
-Description:
-  Runs a llm
-
-Usage:
-  llama-run [options] model [prompt]
-
-Options:
-  -c, --context-size <value>
-      Context size (default: 2048)
-  -n, -ngl, --ngl <value>
-      Number of GPU layers (default: 0)
-  --temp <value>
-      Temperature (default: 0.8)
-  -v, --verbose, --log-verbose
-      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
-  -h, --help
-      Show help message
-
-Commands:
-  model
-      Model is a string with an optional prefix of
-      huggingface:// (hf://), ollama://, https:// or file://.
-      If no protocol is specified and a file exists in the specified
-      path, file:// is assumed, otherwise if a file does not exist in
-      the specified path, ollama:// is assumed. Models that are being
-      pulled are downloaded with .partial extension while being
-      downloaded and then renamed as the file without the .partial
-      extension when complete.
-
-Examples:
-  llama-run llama3
-  llama-run ollama://granite-code
-  llama-run ollama://smollm:135m
-  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
-  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
-  llama-run ms://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
-  llama-run modelscope://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
-  llama-run https://example.com/some-file1.gguf
-  llama-run some-file2.gguf
-  llama-run file://some-file3.gguf
-  llama-run --ngl 999 some-file4.gguf
-  llama-run --ngl 999 some-file5.gguf Hello World
-```
--- a/tools/run/linenoise.cpp/linenoise.cpp
+++ b/tools/run/linenoise.cpp/linenoise.cpp
--- a/tools/run/linenoise.cpp/linenoise.h
+++ b/tools/run/linenoise.cpp/linenoise.h
@ -1,137 +0,0 @@
-/* linenoise.h -- VERSION 1.0
- *
- * Guerrilla line editing library against the idea that a line editing lib
- * needs to be 20,000 lines of C++ code.
- *
- * See linenoise.cpp for more information.
- *
- * ------------------------------------------------------------------------
- *
- * Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
- * Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
- * Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *  *  Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *
- *  *  Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __LINENOISE_H
-#define __LINENOISE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stddef.h> /* For size_t. */
-#include <stdlib.h>
-
-extern const char * linenoiseEditMore;
-
-/* The linenoiseState structure represents the state during line editing.
- * We pass this state to functions implementing specific editing
- * functionalities. */
-struct linenoiseState {
-    int          in_completion;  /* The user pressed TAB and we are now in completion
-                         * mode, so input is handled by completeLine(). */
-    size_t       completion_idx; /* Index of next completion to propose. */
-    int          ifd;            /* Terminal stdin file descriptor. */
-    int          ofd;            /* Terminal stdout file descriptor. */
-    char *       buf;            /* Edited line buffer. */
-    size_t       buflen;         /* Edited line buffer size. */
-    const char * prompt;         /* Prompt to display. */
-    size_t       plen;           /* Prompt length. */
-    size_t       pos;            /* Current cursor position. */
-    size_t       oldcolpos;      /* Previous refresh cursor column position. */
-    size_t       len;            /* Current edited line length. */
-    size_t       cols;           /* Number of columns in terminal. */
-    size_t       oldrows;        /* Rows used by last refreshed line (multiline mode) */
-    int          history_index;  /* The history index we are currently editing. */
-};
-
-struct linenoiseCompletions {
-    size_t  len     = 0;
-    char ** cvec    = nullptr;
-    bool    to_free = true;
-
-    ~linenoiseCompletions() {
-        if (!to_free) {
-            return;
-        }
-
-        for (size_t i = 0; i < len; ++i) {
-            free(cvec[i]);
-        }
-
-        free(cvec);
-    }
-};
-
-/* Non blocking API. */
-int          linenoiseEditStart(struct linenoiseState * l, int stdin_fd, int stdout_fd, char * buf, size_t buflen,
-                                const char * prompt);
-const char * linenoiseEditFeed(struct linenoiseState * l);
-void         linenoiseEditStop(struct linenoiseState * l);
-void         linenoiseHide(struct linenoiseState * l);
-void         linenoiseShow(struct linenoiseState * l);
-
-/* Blocking API. */
-const char * linenoise(const char * prompt);
-void         linenoiseFree(void * ptr);
-
-/* Completion API. */
-typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
-typedef const char *(linenoiseHintsCallback) (const char *, int * color, int * bold);
-typedef void(linenoiseFreeHintsCallback)(const char *);
-void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
-void linenoiseSetHintsCallback(linenoiseHintsCallback *);
-void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
-void linenoiseAddCompletion(linenoiseCompletions *, const char *);
-
-/* History API. */
-int linenoiseHistoryAdd(const char * line);
-int linenoiseHistorySetMaxLen(int len);
-int linenoiseHistorySave(const char * filename);
-int linenoiseHistoryLoad(const char * filename);
-
-/* Other utilities. */
-void linenoiseClearScreen(void);
-void linenoiseSetMultiLine(int ml);
-void linenoisePrintKeyCodes(void);
-void linenoiseMaskModeEnable(void);
-void linenoiseMaskModeDisable(void);
-
-/* Encoding functions. */
-typedef size_t(linenoisePrevCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
-typedef size_t(linenoiseNextCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
-typedef size_t(linenoiseReadCode)(int fd, char * buf, size_t buf_len, int * c);
-
-void linenoiseSetEncodingFunctions(linenoisePrevCharLen * prevCharLenFunc, linenoiseNextCharLen * nextCharLenFunc,
-                                   linenoiseReadCode * readCodeFunc);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __LINENOISE_H */
--- a/tools/run/run.cpp
+++ b/tools/run/run.cpp
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@ -1,10 +1,10 @@
 #include "common.h"
+#include "download.h"
 #include "log.h"
 #include "llama.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
 #include "chat.h"
-#include "arg.h" // for common_remote_get_content; TODO: use download.h only
 #include "base64.hpp"

 #include "server-common.h"
@ -779,7 +779,7 @@ static void handle_media(
        // download remote image
        // TODO @ngxson : maybe make these params configurable
        common_remote_params params;
-        params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+        params.headers.push_back({"User-Agent", "llama.cpp/" + build_info});
        params.max_size = 1024 * 1024 * 10; // 10MB
        params.timeout  = 10; // seconds
        SRV_INF("downloading image from '%s'\n", url.c_str());
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -4,7 +4,6 @@
 #include "server-task.h"
 #include "server-queue.h"

-#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@ -16,7 +15,6 @@
 #include <cstddef>
 #include <cinttypes>
 #include <memory>
-#include <unordered_set>
 #include <filesystem>

 // fix problem with std::min and std::max
@ -81,6 +79,8 @@ struct server_slot {

    common_speculative * spec = nullptr;

+    // TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
+    //       see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
    std::unique_ptr<const server_task> task;
    std::unique_ptr<const server_task> task_prev; // used for debugging

@ -155,7 +155,7 @@ struct server_slot {

    common_sampler_ptr smpl;

-    llama_token sampled; // in speculative mode, this is the last accepted token
+    llama_token  sampled; // in speculative mode, this is the last accepted token
    llama_tokens drafted;

    // stats
@ -203,12 +203,46 @@ struct server_slot {
        alora_invocation_start = -1;
    }

+    // remove cached prompt + tokens
+    void clear(bool allow_processing) {
+        if (!allow_processing) {
+            GGML_ASSERT(!is_processing());
+        }
+
+        SLT_INF(*this, "clearing slot with %zu tokens\n", prompt.tokens.size());
+
+        llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
+        prompt.tokens.clear();
+    }
+
+    void init_sampler() const {
+        const int64_t t_start = ggml_time_us();
+
+        common_sampler_reset(smpl.get());
+
+        int n_text = 0;
+
+        for (int i = 0; i < (int) prompt.tokens.size(); i++) {
+            const llama_token id = prompt.tokens[i];
+
+            if (id != LLAMA_TOKEN_NULL) {
+                common_sampler_accept(smpl.get(), id, false);
+                n_text++;
+            }
+        }
+
+        SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
+                (ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
+    }
+
+    // TODO: move to server_task
    bool need_embd() const {
        GGML_ASSERT(task);

        return server_task_type_need_embd(task->type);
    }

+    // TODO: move to server_task
    bool need_logits() const {
        GGML_ASSERT(task);

@ -260,10 +294,13 @@ struct server_slot {
            SLT_WRN(*this, "%s", "slot is not processing\n");
            return;
        }
+
        generated_token_probs.push_back(token);
    }

    int get_n_draft_max() const {
+        GGML_ASSERT(task);
+
        if (!can_speculate()) {
            return 0;
        }
@ -289,12 +326,14 @@ struct server_slot {
    }

    // note: a slot can also be either a parent or a child
+    // TODO: move to server_task
    bool is_parent() const {
-        return is_processing() && task->n_children > 0;
+        return task->n_children > 0;
    }

+    // TODO: move to server_task
    bool is_child() const {
-        return is_processing() && task->id_parent >= 0;
+        return task->id_parent >= 0;
    }

    void release() {
@ -303,10 +342,16 @@ struct server_slot {

            SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);

-            t_last_used = ggml_time_us();
+            t_last_used        =  ggml_time_us();
            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
+
            state = SLOT_STATE_IDLE;

+            // do not keep context of the child slots - the parent's context is enough
+            if (is_child()) {
+                clear(false);
+            }
+
            task_prev = std::move(task);
            task.reset();

@ -427,14 +472,22 @@ struct server_slot {
    }

    void copy_state_to(server_slot & other) const {
-        llama_memory_seq_rm(llama_get_memory(ctx), other.id, 0, -1);
-        llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1);
+        GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT);
+
+        llama_memory_seq_rm(llama_get_memory(ctx), other.id,     -1, -1);
+        llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, -1, -1);
+
        other.n_decoded   = n_decoded;
        other.n_remaining = n_remaining;
        other.i_batch     = i_batch;
+
+        other.t_start_process_prompt    = t_start_process_prompt;
+        other.t_prompt_processing       = t_prompt_processing;
        other.n_prompt_tokens_cache     = n_prompt_tokens_cache;
        other.n_prompt_tokens_processed = n_prompt_tokens_processed;
+
        other.prompt = prompt.clone();
+        other.init_sampler();
    }
 };

@ -747,6 +800,7 @@ private:
        }

        slots.clear();
+
        for (int i = 0; i < params_base.n_parallel; i++) {
            server_slot slot;

@ -995,7 +1049,7 @@ private:
                ret->prompt_save(*prompt_cache);

                if (!ret->prompt_load(*prompt_cache, task.tokens)) {
-                    clear_slot(*ret);
+                    ret->clear(false);
                }

                prompt_cache->update();
@ -1007,17 +1061,6 @@ private:
        return ret;
    }

-    void clear_slot(server_slot & slot, bool allow_processing = false) const {
-        if (!allow_processing) {
-            GGML_ASSERT(!slot.is_processing());
-        }
-
-        SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
-
-        llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
-        slot.prompt.tokens.clear();
-    }
-
    // return true if at least one slot has been cleared
    // TODO: improve logic
    //       - smarter decision which slot to clear (LRU or longest prompt?)
@ -1038,7 +1081,7 @@ private:
            if (slot.prompt.n_tokens() > 0) {
                SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());

-                clear_slot(slot);
+                slot.clear(false);

                res = true;

@ -1184,7 +1227,7 @@ private:
            ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
            : SLOT_STATE_STARTED;

-        SLT_INF(slot, "%s", "processing task\n");
+        SLT_INF(slot, "processing task, is_child = %d\n", slot.is_child());

        return true;
    }
@ -1821,7 +1864,7 @@ private:
                    // Erase token cache
                    const size_t n_erased = slot->prompt.tokens.size();

-                    clear_slot(*slot);
+                    slot->clear(false);

                    auto res = std::make_unique<server_task_result_slot_erase>();
                    res->id       = task.id;
@ -2055,8 +2098,29 @@ private:
                    continue;
                }

+                // check if this is a child slot
+                if (slot.state == SLOT_STATE_WAIT_OTHER) {
+                    SLT_DBG(slot, "%s", "waiting for parent slot to complete\n");
+                    continue;
+                }
+
                // this slot still has a prompt to be processed
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
+                    // wait for all children to be launched
+                    if (slot.is_parent()) {
+                        int n_launched = 0;
+                        for (auto & other : slots) {
+                            if (other.is_processing() && other.is_child() && other.task->id_parent == slot.task->id) {
+                                ++n_launched;
+                            }
+                        }
+
+                        if (n_launched < slot.task->n_children) {
+                            SLT_DBG(slot, "waiting for children to be launched, n_children = %d, n_launched = %d\n", slot.task->n_children, n_launched);
+                            continue;
+                        }
+                    }
+
                    const auto & input_tokens = slot.task->tokens;

                    // TODO: maybe move branch to outside of this loop in the future
@ -2357,7 +2421,7 @@ private:
                    if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                        SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);

-                        clear_slot(slot, /*allow_processing=*/true);
+                        slot.clear(true);

                        // there is no common part left
                        slot.n_prompt_tokens_cache = 0;
@ -2457,16 +2521,6 @@ private:

                        GGML_ASSERT(batch.n_tokens > 0);

-                        common_sampler_reset(slot.smpl.get());
-
-                        // Process all prompt tokens through sampler system
-                        for (int i = 0; i < slot.task->n_tokens(); ++i) {
-                            llama_token id = input_tokens[i];
-                            if (id != LLAMA_TOKEN_NULL) {
-                                common_sampler_accept(slot.smpl.get(), id, false);
-                            }
-                        }
-
                        // extract the logits only for the last token
                        batch.logits[batch.n_tokens - 1] = true;

@ -2475,6 +2529,8 @@ private:

                        SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);

+                        slot.init_sampler();
+
                        const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                        const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);

@ -2521,11 +2577,6 @@ private:
            }
        }

-        if (batch.n_tokens == 0) {
-            SRV_WRN("%s", "no tokens to decode\n");
-            return;
-        }
-
        SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);

        if (slot_batched) {
@ -2542,6 +2593,10 @@ private:
            llama_set_embeddings(ctx, slot_batched->need_embd());
        }

+        if (batch.n_tokens == 0) {
+            SRV_WRN("%s", "no tokens to decode\n");
+        }
+
        int32_t i_next = 0;

        // process the created batch of tokens
@ -2593,7 +2648,7 @@ private:

                                // note: it's complicated to keep track of how much of the current batch has been
                                //       processed before the error occurred, so we simply clear the entire context
-                                clear_slot(slot);
+                                slot.clear(false);
                            }
                        }

@ -2617,31 +2672,34 @@ private:
            // on successful decode, restore the original batch size
            n_batch = llama_n_batch(ctx);

-            // technically, measuring the time here excludes the sampling time for the last batch
-            // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
-            const int64_t t_current = ggml_time_us();
-
+            // handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
            for (auto & slot : slots) {
-                // may need to copy state to other slots
                if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
-                    std::vector<server_slot *> child_slots;
+                    SLT_INF(slot, "parent task prompt done, n_children = %d\n", slot.task->n_children);
+
+                    std::vector<server_slot *> children;
                    for (auto & other : slots) {
                        if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
-                            child_slots.push_back(&other);
+                            children.push_back(&other);
                        }
                    }

                    // we can only proceed if all child slots are having the correct tasks
-                    if (child_slots.size() == slot.task->n_children) {
+                    if (slot.task->n_children == (int) children.size()) {
                        // copy state to the child slots
-                        for (auto & child : child_slots) {
-                            SLT_INF(slot, "copying state to child %d\n", child->id);
+                        for (auto & child : children) {
+                            SLT_INF(slot, " - copying state to child %d\n", child->id);
+
+                            GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
+
                            slot.copy_state_to(*child);
                            child->state = SLOT_STATE_DONE_PROMPT;
                        }
                    }
                }
+            }

+            for (auto & slot : slots) {
                // optionally send prompt processing progress
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
                    if (slot.task->params.stream && slot.task->params.return_progress) {
@ -2687,6 +2745,9 @@ private:

                common_sampler_accept(slot.smpl.get(), id, true);

+                // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
+                const int64_t t_current = ggml_time_us();
+
                slot.n_decoded += 1;

                if (slot.n_decoded == 1) {
@ -2723,13 +2784,15 @@ private:
                    continue;
                }

-                size_t n_draft = slot.drafted.size();
+                const size_t n_draft = slot.drafted.size();

                // the accepted tokens from the speculation
                const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
                slot.i_batch_dft.clear();
                slot.drafted.clear();

+                const int64_t t_current = ggml_time_us();
+
                slot.n_decoded += ids.size();

                slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
@ -2924,17 +2987,25 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
            task.params.oaicompat_cmpl_id = completion_id;
            task.params.oaicompat_model   = meta->model_name;

+            // prepare child tasks
            if (task.params.n_cmpl > 1) {
                task.n_children = task.params.n_cmpl - 1;
-                for (size_t j = 0; j < task.n_children; j++) {
-                    server_task child = task.create_child(
-                        task.id,
-                        rd.get_new_id());
+
+                for (int j = 0; j < task.n_children; j++) {
+                    server_task child = task.create_child(task.id, rd.get_new_id());
+
+                    // use different sampling seed for each child
+                    // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
+                    if (child.params.sampling.seed != LLAMA_DEFAULT_SEED) {
+                        child.params.sampling.seed += j + 1;
+                    }
+
                    tasks.push_back(std::move(child));
                }
            }

-            tasks.push_back(std::move(task));
+            // note: the parent task always launches first
+            tasks.insert(tasks.begin(), std::move(task));
        }

        rd.post_tasks(std::move(tasks));
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@ -121,8 +121,8 @@ struct server_task {
    int id_slot   = -1;

    // used by parallel sampling (multiple completions from same prompt)
-    size_t n_children =  0; // number of tasks reusing this prompt
-    int    id_parent  = -1;
+    int n_children =  0; // number of tasks reusing this prompt
+    int id_parent  = -1;

    // used by SERVER_TASK_TYPE_INFERENCE
    task_params   params;
@ -173,11 +173,13 @@ struct server_task {

    server_task create_child(int id_parent, int id_child) const {
        server_task copy;
+
        copy.id        = id_child;
        copy.id_parent = id_parent;
        copy.params    = params;
        copy.type      = type;
        copy.tokens    = tokens.clone();
+
        return copy;
    }

--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@ -503,5 +503,4 @@ def test_chat_completions_multiple_choices():
    assert len(res.body["choices"]) == 2
    for choice in res.body["choices"]:
        assert "assistant" == choice["message"]["role"]
-        assert match_regex("Suddenly", choice["message"]["content"])
        assert choice["finish_reason"] == "length"
--- a/tools/server/tests/unit/test_completion.py
+++ b/tools/server/tests/unit/test_completion.py
@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
        if expect_ok:
            assert res.status_code == 200
+
+        # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
+        if res.status_code == 200:
            assert "content" in res.body
            if "timings" in res.body:
                assert res.body["timings"]["predicted_n"] == n_predict
-        else:
-            assert res.status_code == 500
-            assert "content" not in res.body


@pytest.mark.parametrize(
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@ -10,21 +10,11 @@
 	import { INPUT_CLASSES } from '$lib/constants/input-classes';
 	import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
 	import { config } from '$lib/stores/settings.svelte';
-	import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
+	import { modelOptions, selectedModelId } from '$lib/stores/models.svelte';
 	import { isRouterMode } from '$lib/stores/server.svelte';
 	import { chatStore } from '$lib/stores/chat.svelte';
 	import { activeMessages } from '$lib/stores/conversations.svelte';
-	import {
-		FileTypeCategory,
-		MimeTypeApplication,
-		FileExtensionAudio,
-		FileExtensionImage,
-		FileExtensionPdf,
-		FileExtensionText,
-		MimeTypeAudio,
-		MimeTypeImage,
-		MimeTypeText
-	} from '$lib/enums';
+	import { MimeTypeText } from '$lib/enums';
 	import { isIMEComposing, parseClipboardContent } from '$lib/utils';
 	import {
 		AudioRecorder,
@ -61,7 +51,6 @@
 	let audioRecorder: AudioRecorder | undefined;
 	let chatFormActionsRef: ChatFormActions | undefined = $state(undefined);
 	let currentConfig = $derived(config());
-	let fileAcceptString = $state<string | undefined>(undefined);
 	let fileInputRef: ChatFormFileInputInvisible | undefined = $state(undefined);
 	let isRecording = $state(false);
 	let message = $state('');
@ -104,40 +93,6 @@
 		return null;
 	});

-	// State for model props reactivity
-	let modelPropsVersion = $state(0);
-
-	// Fetch model props when active model changes (works for both MODEL and ROUTER mode)
-	$effect(() => {
-		if (activeModelId) {
-			const cached = modelsStore.getModelProps(activeModelId);
-			if (!cached) {
-				modelsStore.fetchModelProps(activeModelId).then(() => {
-					modelPropsVersion++;
-				});
-			}
-		}
-	});
-
-	// Derive modalities from active model (works for both MODEL and ROUTER mode)
-	let hasAudioModality = $derived.by(() => {
-		if (activeModelId) {
-			void modelPropsVersion; // Trigger reactivity on props fetch
-			return modelsStore.modelSupportsAudio(activeModelId);
-		}
-
-		return false;
-	});
-
-	let hasVisionModality = $derived.by(() => {
-		if (activeModelId) {
-			void modelPropsVersion; // Trigger reactivity on props fetch
-			return modelsStore.modelSupportsVision(activeModelId);
-		}
-
-		return false;
-	});
-
 	function checkModelSelected(): boolean {
 		if (!hasModelSelected) {
 			// Open the model selector
@ -148,42 +103,12 @@
 		return true;
 	}

-	function getAcceptStringForFileType(fileType: FileTypeCategory): string {
-		switch (fileType) {
-			case FileTypeCategory.IMAGE:
-				return [...Object.values(FileExtensionImage), ...Object.values(MimeTypeImage)].join(',');
-
-			case FileTypeCategory.AUDIO:
-				return [...Object.values(FileExtensionAudio), ...Object.values(MimeTypeAudio)].join(',');
-
-			case FileTypeCategory.PDF:
-				return [...Object.values(FileExtensionPdf), ...Object.values(MimeTypeApplication)].join(
-					','
-				);
-
-			case FileTypeCategory.TEXT:
-				return [...Object.values(FileExtensionText), MimeTypeText.PLAIN].join(',');
-
-			default:
-				return '';
-		}
-	}
-
 	function handleFileSelect(files: File[]) {
 		onFileUpload?.(files);
 	}

-	function handleFileUpload(fileType?: FileTypeCategory) {
-		if (fileType) {
-			fileAcceptString = getAcceptStringForFileType(fileType);
-		} else {
-			fileAcceptString = undefined;
-		}
-
-		// Use setTimeout to ensure the accept attribute is applied before opening dialog
-		setTimeout(() => {
-			fileInputRef?.click();
-		}, 10);
+	function handleFileUpload() {
+		fileInputRef?.click();
 	}

 	async function handleKeydown(event: KeyboardEvent) {
@ -343,13 +268,7 @@
 	});
 </script>

-<ChatFormFileInputInvisible
-	bind:this={fileInputRef}
-	bind:accept={fileAcceptString}
-	{hasAudioModality}
-	{hasVisionModality}
-	onFileSelect={handleFileSelect}
-/>
+<ChatFormFileInputInvisible bind:this={fileInputRef} onFileSelect={handleFileSelect} />

 <form
 	onsubmit={handleSubmit}
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte
@ -4,14 +4,13 @@
 	import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
 	import * as Tooltip from '$lib/components/ui/tooltip';
 	import { FILE_TYPE_ICONS } from '$lib/constants/icons';
-	import { FileTypeCategory } from '$lib/enums';

 	interface Props {
 		class?: string;
 		disabled?: boolean;
 		hasAudioModality?: boolean;
 		hasVisionModality?: boolean;
-		onFileUpload?: (fileType?: FileTypeCategory) => void;
+		onFileUpload?: () => void;
 	}

 	let {
@ -27,10 +26,6 @@
 			? 'Text files and PDFs supported. Images, audio, and video require vision models.'
 			: 'Attach files';
 	});
-
-	function handleFileUpload(fileType?: FileTypeCategory) {
-		onFileUpload?.(fileType);
-	}
 </script>

 <div class="flex items-center gap-1 {className}">
@ -61,7 +56,7 @@
 					<DropdownMenu.Item
 						class="images-button flex cursor-pointer items-center gap-2"
 						disabled={!hasVisionModality}
-						onclick={() => handleFileUpload(FileTypeCategory.IMAGE)}
+						onclick={() => onFileUpload?.()}
 					>
 						<FILE_TYPE_ICONS.image class="h-4 w-4" />

@ -81,7 +76,7 @@
 					<DropdownMenu.Item
 						class="audio-button flex cursor-pointer items-center gap-2"
 						disabled={!hasAudioModality}
-						onclick={() => handleFileUpload(FileTypeCategory.AUDIO)}
+						onclick={() => onFileUpload?.()}
 					>
 						<FILE_TYPE_ICONS.audio class="h-4 w-4" />

@ -98,7 +93,7 @@

 			<DropdownMenu.Item
 				class="flex cursor-pointer items-center gap-2"
-				onclick={() => handleFileUpload(FileTypeCategory.TEXT)}
+				onclick={() => onFileUpload?.()}
 			>
 				<FILE_TYPE_ICONS.text class="h-4 w-4" />

@ -109,7 +104,7 @@
 				<Tooltip.Trigger class="w-full">
 					<DropdownMenu.Item
 						class="flex cursor-pointer items-center gap-2"
-						onclick={() => handleFileUpload(FileTypeCategory.PDF)}
+						onclick={() => onFileUpload?.()}
 					>
 						<FILE_TYPE_ICONS.pdf class="h-4 w-4" />

--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
@ -24,7 +24,7 @@
 		isRecording?: boolean;
 		hasText?: boolean;
 		uploadedFiles?: ChatUploadedFile[];
-		onFileUpload?: (fileType?: FileTypeCategory) => void;
+		onFileUpload?: () => void;
 		onMicClick?: () => void;
 		onStop?: () => void;
 	}
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte
@ -1,35 +1,14 @@
 <script lang="ts">
-	import { generateModalityAwareAcceptString } from '$lib/utils';
-
 	interface Props {
-		accept?: string;
 		class?: string;
-		hasAudioModality?: boolean;
-		hasVisionModality?: boolean;
 		multiple?: boolean;
 		onFileSelect?: (files: File[]) => void;
 	}

-	let {
-		accept = $bindable(),
-		class: className = '',
-		hasAudioModality = false,
-		hasVisionModality = false,
-		multiple = true,
-		onFileSelect
-	}: Props = $props();
+	let { class: className = '', multiple = true, onFileSelect }: Props = $props();

 	let fileInputElement: HTMLInputElement | undefined;

-	// Use modality-aware accept string by default, but allow override
-	let finalAccept = $derived(
-		accept ??
-			generateModalityAwareAcceptString({
-				hasVision: hasVisionModality,
-				hasAudio: hasAudioModality
-			})
-	);
-
 	export function click() {
 		fileInputElement?.click();
 	}
@ -46,7 +25,6 @@
 	bind:this={fileInputElement}
 	type="file"
 	{multiple}
-	accept={finalAccept}
 	onchange={handleFileSelect}
 	class="hidden {className}"
 />
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Xuan-Son Nguyen	506bb6e010	model: try to improve Qwen3 Next (#18683 ) * qwen3next: simplify qkvz projection * use ggml_swiglu_split * revert swiglu_split, but remove redundant repeat() * fix missing reshape * rm 2 redundant transposes * move mul_mat(k,q) to outside of chunking * rm redundant cont * improve g_cs_chunk * add comments about no cont * use std::pair instead of ggml_concat * vectorize key_gdiff calculation * rm unused tensor * avoid ggml_concat inside loop * bring back ggml_concat as it may not work on other backend * nits	2026-01-11 12:53:33 +01:00
thom-dev-fr	79456a690a	readme : update UIs (#18751 )	2026-01-11 13:46:50 +02:00
Xuan-Son Nguyen	28068af789	security: narrow down the scope of what we consider a vulnerability (#18752 ) * security: narrow down the scope of what we consider a vulnerability * fix typo	2026-01-11 12:23:36 +01:00
shaofeiqi	707cbafcaa	opencl: add SOFTPLUS op support (#18726 )	2026-01-10 21:57:44 -08:00
Aman Gupta	b137718878	test-backend-ops: fix mxfp4 tests on blackwell (#18736 )	2026-01-11 01:12:57 +08:00
Johannes Gäßler	d2ff4e23ac	HIP: adjust RDNA3.5 MMQ kernel selction logic (#18666 )	2026-01-10 17:19:01 +01:00
Perry Naseck	657a2e644b	cmake : update blas logic (#18205 )	2026-01-10 18:00:54 +02:00
Georgi Gerganov	f307926482	server : adjust unified KV cache tests (#18716 )	2026-01-10 17:51:56 +02:00
Sigbjørn Skjæret	7fdc8c893d	scripts : follow api redirects in pr2wt.sh (#18739 )	2026-01-10 16:04:05 +01:00
Xuan-Son Nguyen	23f82f2420	preset: allow named remote preset (#18728 ) * preset: allow named remote preset * nits: fix docs * cont docs	2026-01-10 15:12:29 +01:00
Aaron Teo	2656c0d265	docs(ggml): update backend ops (#18734 ) Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>	2026-01-10 18:48:17 +08:00
Michael Wand	600a366478	Corrected: changed s13 = src1->nb[3] instead of nb[2] (#18724 )	2026-01-10 10:16:07 +01:00
Adrien Gallouët	ea23c15990	common : add --license to display embedded licenses (#18696 ) This commit introduces a mechanism to embed all licenses directly into the compiled binaries. This eliminates the need to distribute separate LICENSE files alongside the executable, making the binaries self-contained and simplifying deployment.	2026-01-10 09:46:24 +01:00
Xuan-Son Nguyen	9ac2693a30	server: fix n_cmpl not skipping processing prompt (#18663 ) * server: fix n_cmpl not skipping processing * fix infinite loop on empty batch * cont : init child samplers + modify child logic * cont : cleanup * cont : improve n_cmpl logic - launch the parent task first so it finds the slot with best cache - parent task waits for child tasks to be launched - when a child task finishes - remove its cache * cont : remove redundant function * cont : reduce parent checks * fix : nullptr task dereference --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-01-10 00:00:41 +01:00
Simranjeet Singh	a61c8bc3bf	mtmd: Add Gemma3n multimodal support with MobileNetV5 vision encoder (#18256 ) * Add Gemma3nVisionModel - MobileNetV5 vision encoder convertor to convert_hf_to_gguf.py. Add gemma3n to vision projectors in gguf-py/gguf/constants.py. * Add mobilenetv5 impl * Fix comments, remove unused vars * Fix permute and remove transpose of projection weights * Fix comments, remove debugging prints from hf_to_gguf * 1. Hard-code image_mean = 0 and image_std = 1 2. Use available tensor mapping logic 3. Remove redundant chat template replacement of soft tokens placeholder with media placeholder * 1. Move mobilenetv5 helpers declarations to `clip_graph_mobilenetv5` struct and definitions to mobilenetv5.cpp 2.Remove unused `clip_is_gemma3n` func declarations and definitions 3. Remove redundant `rescale_image_u8_to_f32` func and use `normalize_image_u8_to_f32` with zero mean and unit std 4. Calculate n_patches using image_size / patch_size * Remove obsolete comments * - convert_hf_to_gguf.py & constants.py & tensor_mapping.py: Use explicit mapping: Custom map for double indexed blocks and tensor_mapping.py for rest - convert_hf_to_gguf.py: Unsqueeze Stem Bias and Layer scale tensors to correct shape while converting to gguf - mobilenetv5.cpp: Remove explicit reshaping of Stem Bias and Layer scale which are now handled while converting to gguf, replace fprintf with LOG_* - clip.cpp: Remove unused embedding and hard_emb_norm tensor loading * - Rename tensors to v.conv..., v.blk..., v.msfa... to better align with already existing terminology * Fix stem conv bias name * Remove explicit handling of bias term for stem conv * - Change order of addition in "project_per_layer_inputs" to support broadcasting of vision inp_per_layer - Simplify the vision embeddings path of "get_per_layer_inputs" to output [n_embd_altup, n_layer, 1], broadcastable * clean up conversion script * fix code style * also preserve audio tensors * trailing space * split arch A and V * rm unused gemma3 func * fix alignment --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2026-01-09 23:42:38 +01:00
shaofeiqi	593da7fa49	opencl: add EXPM1 op (#18704 )	2026-01-09 10:13:13 -08:00
Reese Levine	9e41884dce	Updates to webgpu get_memory (#18707 )	2026-01-09 08:17:18 -08:00
Pascal	ec8fd7876b	Webui/file upload (#18694 ) * webui: fix restrictive file type validation * webui: simplify file processing logic * chore: update webui build output * webui: remove file picker extension whitelist (1/2) * webui: remove file picker extension whitelist (2/2) * chore: update webui build output * refactor: Cleanup * chore: update webui build output * fix: update ChatForm storybook test after removing accept attribute * chore: update webui build output * refactor: more cleanup * chore: update webui build output	2026-01-09 16:45:32 +01:00
Asbjørn Olling	a180ba78c7	cmake: only build cli when server is enabled (#18670 )	2026-01-09 16:43:26 +01:00
Georgi Gerganov	53eb9435da	server : fix timing of prompt/generation (#18713 )	2026-01-09 12:59:50 +02:00
Georgi Gerganov	d3435efc8a	scripts : pr2wt.sh reset to remote head (#18695 ) * scripts : pr2wt.sh reset to remote head * cont : cleaner * cont : restore --set-upstream-to	2026-01-09 12:16:40 +02:00
Georgi Gerganov	f5f8812f7c	server : use different seeds for child completions (#18700 ) * server : use different seeds for child completions * cont : handle default seed * cont : note	2026-01-09 09:33:50 +02:00
Xuan-Son Nguyen	8ece3836b4	common: support remote preset (#18520 ) * arg: support remote preset * proof reading * allow one HF repo to point to multiple HF repos * docs: mention about multiple GGUF use case * correct clean_file_name * download: also return HTTP status code * fix case with cache file used * fix --offline option	2026-01-08 22:35:40 +01:00
Aaron Teo	046d5fd44e	llama: use host memory if device reports 0 memory (#18587 )	2026-01-09 05:34:56 +08:00
Masashi Yoshimura	480160d472	ggml-webgpu: Fix GGML_MEM_ALIGN to 8 for emscripten. (#18628 ) * Fix GGML_MEM_ALIGN to 8 for emscripten. * Add a comment explaining the need for GGML_MEM_ALIGN == 8 in 64-bit wasm with emscripten	2026-01-08 08:36:42 -08:00
Reese Levine	15bff84bf5	ggml webgpu: initial flashattention implementation (#18610 ) * FlashAttention (#13) * Add inplace softmax * Move rms_norm to split row approach * Update debug for supports_op * clean up debug statements * neg f16xf32xip builds and runs, havent actually ran a model that uses neg kernel yet though * neg passes backend test * unary operators pass ggml tests * rms_norm double declaration bug atoned * abides by editor-config * removed vestigial files * fixed autoconfig * All operators (inlcluding xielu) working * removed unnecesarry checking if node->src[1] exists for unary operators * responded and dealt with PR comments * implemented REPL_Template support and removed bug in unary operators kernel * formatted embed wgsl and ggml-webgpu.cpp * Faster tensors (#8) Add fast matrix and matrix/vector multiplication. * Use map for shader replacements instead of pair of strings * Wasm (#9) * webgpu : fix build on emscripten * more debugging stuff * test-backend-ops: force single thread on wasm * fix single-thread case for init_tensor_uniform * use jspi * add pthread * test: remember to set n_thread for cpu backend * Add buffer label and enable dawn-specific toggles to turn off some checks * Intermediate state * Fast working f16/f32 vec4 * Working float fast mul mat * Clean up naming of mul_mat to match logical model, start work on q mul_mat * Setup for subgroup matrix mat mul * Basic working subgroup matrix * Working subgroup matrix tiling * Handle weirder sg matrix sizes (but still % sg matrix size) * Working start to gemv * working f16 accumulation with shared memory staging * Print out available subgroup matrix configurations * Vectorize dst stores for sg matrix shader * Gemv working scalar * Minor set_rows optimization (#4) * updated optimization, fixed errors * non vectorized version now dispatches one thread per element * Simplify * Change logic for set_rows pipelines --------- Co-authored-by: Neha Abbas <nehaabbas@macbookpro.lan> Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local> Co-authored-by: Reese Levine <reeselevine1@gmail.com> * Comment on dawn toggles * Working subgroup matrix code for (semi)generic sizes * Remove some comments * Cleanup code * Update dawn version and move to portable subgroup size * Try to fix new dawn release * Update subgroup size comment * Only check for subgroup matrix configs if they are supported * Add toggles for subgroup matrix/f16 support on nvidia+vulkan * Make row/col naming consistent * Refactor shared memory loading * Move sg matrix stores to correct file * Working q4_0 * Formatting * Work with emscripten builds * Fix test-backend-ops emscripten for f16/quantized types * Use emscripten memory64 to support get_memory * Add build flags and try ci --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> * Remove extra whitespace * Move wasm single-thread logic out of test-backend-ops for cpu backend * Disable multiple threads for emscripten single-thread builds in ggml_graph_plan * Refactored pipelines and workgroup calculations (#10) * refactored pipelines * refactored workgroup calculation * removed commented out block of prior maps * Clean up ceiling division pattern --------- Co-authored-by: Neha Abbas <nehaabbas@eduroam-169-233-141-223.ucsc.edu> Co-authored-by: Reese Levine <reeselevine1@gmail.com> * Start work on flash attention * Shader structure set up (many bugs still) * debugging * Working first test * Working with head grouping, head sizes to 128, logit softcap, mask/sinks enabled, f32 * Generalize softmax to work with multiple subgroups, f16 accumulation, mask shared memory tiling * Start work on integrating pre-wgsl * Separate structs/initial shader compilation library into separate files * Work on compilation choices for flashattention * Work on subgroup matrix/tile size portability * subgroup size agnostic online softmax * Cleanups, quantization types * more cleanup * fix wasm build * Refactor flashattention to increase parallelism, use direct loads for KV in somce cases * Checkpoint * formatting * Update to account for default kv cache padding * formatting shader * Add workflow for ggml-ci webgpu * Try passing absolute path to dawn in ggml-ci * Avoid error on device destruction, add todos for proper cleanup * Fix unused warning * Forgot one parameter unused * Move some flashattn computation to f32 for correctness	2026-01-08 08:23:39 -08:00
Jeff Bolz	2524c26164	vulkan: fix push constant size for quantize_q8_1 (#18687 ) I added an assert to catch further mismatches, and it found several. Fix those, too.	2026-01-08 15:40:58 +01:00
Jeff Bolz	cb14b06995	vulkan: optimize ssm_scan (#18630 ) * vulkan: optimize ssm_scan * fix warp vs subgroup naming	2026-01-08 15:16:54 +01:00
Adrien Gallouët	55abc39355	vendor : update cpp-httplib to 0.30.0 (#18660 ) * vendor : update cpp-httplib to 0.30.0 * common : allow custom headers when downloading	2026-01-08 13:53:54 +01:00
Georgi Gerganov	f2f6c88067	scripts : support chaining commands in pr2wt.sh (#18671 )	2026-01-08 13:40:23 +02:00
도로로도로또	945bf10627	metal : add MoE kernel specialization for ne20=5 (#18667 ) Add template specialization for kernel_mul_mm_id_map0 with ne20=5 to support models using 5 active experts (e.g., VAETKI).	2026-01-08 12:37:45 +02:00
Johannes Gäßler	64848deb18	llama-fit-params: free memory target per device (#18679 )	2026-01-08 10:07:58 +01:00
Doctor Shotgun	9a5724dee2	ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH (#18535 ) * ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH * makes the min_batch_size for triggering op offload configurable via env var, defaulting to the prior hardcoded value of 32 * ggml: read GGML_OP_OFFLOAD_MIN_BATCH once and store to dev ctx * cann: forward declaration of device context struct * cann: move offload op check after device context declaration * cuda: fix whitespace Co-authored-by: Aman Gupta <amangupta052@gmail.com> --------- Co-authored-by: Aman Gupta <amangupta052@gmail.com>	2026-01-08 11:03:21 +02:00
Daniel Bevenius	9c142e3a2a	model-conversion : add warn about transformers mismatch (#18691 ) This commit adds a check comparing the installed transformers library with the transformers version that the original model supports. This check will be performed upon a model verification failure and prints a warning/hint to the user suggesting to install the correct version of the transformers library. The motivation for this change is that it is possible for the model verification to fail due to differences in the transformers library used and it might not be obvious that this could be the cause of the failure. With this warning the correct version can be checked and hopefully save time troubleshooting the cause of the verification failure.	2026-01-08 09:29:53 +01:00
Daniel Bevenius	df7fb92170	model-conversion : remove -st targets for converted model (#18689 ) This commit removes the '-st` make target for running the converted embedding model. The motivation for this is that the pooling type is now part of the .gguf metdata of the model and this is used by llama-debug when running the model. So there is no need to specify the pooling type separately any more. The commit also adds an option to specify the type of normalization applied to the output embeddings when running the converted model. And the readme documentation has been updated to reflect these changes.	2026-01-08 09:29:15 +01:00
Julius Tischbein	2038101bd9	llama : add `use_direct_io` flag for model loading (#18166 ) * Adding --direct-io flag for model loading * Fixing read_raw() calls * Fixing Windows read_raw_at * Changing type off_t to size_t for windows and Renaming functions * disable direct io when mmap is explicitly enabled * Use read_raw_unsafe when upload_backend is available, not functional on some devices with Vulkan and SYCL * Fallback to std::fread in case O_DIRECT fails due to bad address * Windows: remove const keywords and unused functions * Update src/llama-mmap.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: jtischbein <jtischbein@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-01-08 08:35:30 +02:00
shaofeiqi	568371a726	opencl: add FILL op support (#18682 )	2026-01-07 22:04:50 -08:00
Sigbjørn Skjæret	5b8844ae53	scripts : fix repos cloned with .git extension (#18669 )	2026-01-07 22:35:34 +01:00
Sigbjørn Skjæret	7e16fef085	convert : more variants of rope_theta config entries (#18668 )	2026-01-07 22:34:51 +01:00
Oliver Walsh	f5245b5e4e	cuda : fix build on cuda 12.8 (#18672 ) compute121 requires 12.9 Signed-off-by: Oliver Walsh <owalsh@redhat.com>	2026-01-07 22:32:44 +01:00
R	ae9f8df778	fix(docker): add missing libglvnd libraries to Vulkan image (#18664 ) Add libglvnd0, libgl1, libglx0, libegl1, libgles2 to the Vulkan Dockerfile base image. These libraries are required by mesa-vulkan-drivers to properly initialize the Vulkan ICD and detect GPU devices. Without these libraries, vkEnumeratePhysicalDevices() returns an empty list, resulting in "ggml_vulkan: No devices found." error. Fixes #17761	2026-01-07 16:57:42 +01:00
Adrien Gallouët	56d2fed2b3	tools : remove llama-run (#18661 ) * tools : remove llama-run * Remove licenses/LICENSE-linenoise Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-01-07 16:18:26 +01:00
Georgi Gerganov	56426673cb	scripts : add pr2wt.sh (#18644 ) * scripts : add pr2wt.sh * script : shebang Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-01-07 15:16:20 +02:00
Daniel Bevenius	bb77764c2d	convert : clarify sentence-transformers-dense-modules help [no ci] (#18662 ) * convert : clarify sentence-transformers-dense-modules help [no ci] This commit updates this options help message which currently looks like this: ```console --sentence-transformers-dense-modules Whether to include sentence-transformers dense modules.It can be used for sentence-transformers models, like google/embeddinggemma-300mDefault these modules are not included. ```	2026-01-07 13:18:53 +01:00
Sigbjørn Skjæret	9dfa8ee950	ci : run cann build unconditionally [no ci] (#18659 )	2026-01-07 13:07:08 +01:00
Jeff Bolz	ca4a8370bc	vulkan: reject ops when a tensor is too large to allocate (#18646 )	2026-01-07 12:03:32 +01:00
virajwad	03023296cf	vulkan: Warptile tuning for Intel Xe2/Xe3 (#18178 ) * modify warptile tuning for xe3 * intel vendor check w/ coopmat support * fix back formatting * fix formatting change 2 * move intel check to chip specific tuning part * Change to support both windows and linux * modify m_warptile to l_warptile for intel * modify warptile tuning for bf16 matmuls to fix regression (m_warptile to l_warptile) * Code style changes * Code style changes (2) * Code style changes (3)	2026-01-07 11:59:47 +01:00