Compare commits
No commits in common. "master" and "b7592" have entirely different histories.
@@ -1395,14 +1395,6 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
 }

-static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
-    builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
-
-    // TODO: Tool calling
-
-    builder.add_content(builder.consume_rest());
-}
-
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());

@@ -1487,9 +1479,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
             common_chat_parse_xiaomi_mimo(builder);
             break;
-        case COMMON_CHAT_FORMAT_SOLAR_OPEN:
-            common_chat_parse_solar_open(builder);
-            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }

@@ -380,8 +380,8 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
         const auto & function = tool.at("function");
         result.push_back({
             /* .name = */ function.at("name"),
-            /* .description = */ function.value("description", ""),
-            /* .parameters = */ function.value("parameters", json::object()).dump(),
+            /* .description = */ function.at("description"),
+            /* .parameters = */ function.at("parameters").dump(),
         });
     }
 }

@@ -669,7 +669,6 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
         case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
-        case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
         case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
         case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
         case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";

@@ -2518,27 +2517,6 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
     return data;
 }

-static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-
-    // TODO: Reasoning effort
-    json additional_context = {};
-
-    data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
-    data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
-
-    data.preserved_tokens = {
-        "<|think|>",
-        "<|content|>",
-        "<|begin|>",
-        "<|end|>",
-    };
-
-    // TODO: Tool calling
-
-    return data;
-}
-
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);

@@ -2802,13 +2780,6 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_magistral(tmpl, params);
     }

-    // Solar Open
-    if (src.find("<|tool_response:begin|>") != std::string::npos &&
-        src.find("<|tool_response:name|>") != std::string::npos &&
-        src.find("<|tool_response:result|>") != std::string::npos) {
-        return common_chat_params_init_solar_open(tmpl, params);
-    }
-
     // Plain handler (no tools)
     if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
         return common_chat_params_init_without_tools(tmpl, params);

@@ -124,7 +124,6 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
     COMMON_CHAT_FORMAT_APRIEL_1_5,
     COMMON_CHAT_FORMAT_XIAOMI_MIMO,
-    COMMON_CHAT_FORMAT_SOLAR_OPEN,

     // These are intended to be parsed by the PEG parser
     COMMON_CHAT_FORMAT_PEG_SIMPLE,

@@ -1062,9 +1062,6 @@ class TextModel(ModelBase):
         if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
             # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
             res = "grok-2"
-        if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
-            # ref: https://huggingface.co/aari1995/German_Semantic_V3
-            res = "jina-v2-de"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"

@@ -1233,12 +1230,6 @@ class TextModel(ModelBase):
         if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
             # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
             res = "kormo"
-        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
-            # ref: https://huggingface.co/tencent/Youtu-LLM-2B
-            res = "youtu"
-        if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
-            # ref: https://huggingface.co/upstage/Solar-Open-100B
-            res = "solar-open"

         if res is None:
             logger.warning("\n")

@@ -2495,7 +2486,6 @@ class StableLMModel(TextModel):
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
     "VoxtralForConditionalGeneration",
-    "IQuestCoderForCausalLM",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA

@@ -3513,7 +3503,7 @@ class QwenModel(TextModel):
         self._set_vocab_qwen()


-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2


@@ -5294,14 +5284,13 @@ class BertModel(TextModel):
         self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

         # convert to phantom space vocab
-        def phantom(tok, toktype):
-            if toktype == gguf.TokenType.CONTROL:
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                 return tok
             if tok.startswith("##"):
                 return tok[2:]
             return "\u2581" + tok
-        assert len(tokens) == len(toktypes)
-        tokens = list(map(phantom, tokens, toktypes))
+        tokens = list(map(phantom, tokens))

         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")

@@ -6415,17 +6404,6 @@ class ARwkv7Model(Rwkv7Model):
         self.gguf_writer.add_head_count(0)


-@ModelBase.register("MaincoderForCausalLM")
-class MaincoderModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.MAINCODER
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        if (head_dim := self.hparams.get("head_dim")) is not None:
-            self.gguf_writer.add_rope_dimension_count(head_dim)
-
-
 @ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA

@@ -7203,7 +7181,6 @@ class DeepseekModel(TextModel):
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
     "KimiVLForConditionalGeneration",
-    "YoutuForCausalLM",
 )
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -7270,15 +7247,7 @@ class DeepseekV2Model(TextModel):
         super().set_gguf_parameters()
         hparams = self.hparams

-        # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
-        # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
-        # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
-        has_moe = hparams.get("n_routed_experts") is not None
-        first_k_dense_replace = hparams.get("first_k_dense_replace")
-        if first_k_dense_replace is None:
-            # Default: if no MoE, all layers are dense; if MoE, none are dense
-            first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
-        self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])

@@ -7290,24 +7259,11 @@ class DeepseekV2Model(TextModel):
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])

-        # MoE parameters (required by C++ code for DEEPSEEK2 arch)
-        # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
-        moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
-        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
-        if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_routed_experts)
-
-        # expert_shared_count is required by C++ code, default to 0 for non-MoE models
-        n_shared_experts = hparams.get("n_shared_experts", 0)
-        self.gguf_writer.add_expert_shared_count(n_shared_experts)
-
-        # When not set, C++ code will use scale_w = false to skip the no-op scaling
-        if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
-            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
-
-        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
-            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
-
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])


@@ -7323,17 +7279,10 @@ class DeepseekV2Model(TextModel):
         # skip vision tensors and remove "language_model." for Kimi-VL
         if "vision_tower" in name or "multi_modal_projector" in name:
             return []
-        if name.startswith("siglip2.") or name.startswith("merger."):
-            return []
         if name.startswith("language_model."):
             name = name.replace("language_model.", "")

-        # skip lm_head.weight if tie_word_embeddings is True
-        if self.hparams.get("tie_word_embeddings", False):
-            if name == "lm_head.weight" or name == "model.lm_head.weight":
-                logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
-                return []
-
         # rename e_score_correction_bias tensors
         if name.endswith("e_score_correction_bias"):
             name = name.replace("e_score_correction_bias", "e_score_correction.bias")

@@ -9343,19 +9292,6 @@ class VoxtralWhisperEncoderModel(WhisperEncoderModel):
         self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size


-@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
-class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
-
-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if ".conv" in name and ".weight" in name:
-            # Was trained in BF16, being safe, avoiding quantizing to FP16
-            return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
-
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1

@@ -10668,79 +10604,6 @@ class JanusProVisionModel(MmprojModel):
         return []


-@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
-class YOUTUVLVisionModel(MmprojModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        assert self.hparams_vision is not None
-        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
-
-        # Handle activation function
-        hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
-        if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
-            self.gguf_writer.add_vision_use_gelu(True)
-        elif hidden_act == "silu":
-            self.gguf_writer.add_vision_use_silu(True)
-        else:
-            raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
-
-        self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
-
-        window_size = self.hparams.get("window_size")
-        if window_size is not None:
-            self.gguf_writer.add_vision_window_size(window_size)
-        # fullatt_block_indexes contains explicit layer indices that use full attention
-        # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
-        # All other layers use window attention
-        fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
-        assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
-        # Store the explicit layer indices for YoutuVL (irregular pattern approach)
-        self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid # unused
-
-        # Skip language model tensors
-        skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
-        if name.startswith(skip_prefixes):
-            return []
-
-        # Try to map the tensor using TensorNameMap (handles vision encoder and projector)
-        try:
-            new_name = self.map_tensor_name(name)
-            return [(new_name, data_torch)]
-        except ValueError:
-            # If mapping fails, log warning and skip
-            logger.warning(f"Cannot map tensor: {name}")
-            return []
-
-
-@ModelBase.register("SolarOpenForCausalLM")
-class SolarOpenModel(Glm4MoeModel):
-    model_arch = gguf.MODEL_ARCH.GLM4_MOE
-
-    def set_vocab(self):
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-
 ###### CONVERSION LOGIC ######


@@ -145,8 +145,6 @@ models = [
     {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
     {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
     {"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
-    {"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
-    {"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

@@ -167,8 +165,6 @@ pre_computed_hashes = [
     {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
     {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
-    # jina-v2-de variants
-    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
 ]


@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH 5)
+set(GGML_VERSION_PATCH 4)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

@@ -358,7 +358,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);

     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

@@ -2053,7 +2053,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
     ggml_free(copy.ctx_unallocated);
 }

-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
     struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
     if (copy.buffer == NULL) {
         return false;

@@ -2064,22 +2064,22 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t

    assert(g1->n_nodes == g2->n_nodes);

-    if (num_test_nodes != 0) {
-        GGML_ASSERT(test_nodes);
-        // Compute the whole graph and only test the output for specific tensors
+    if (test_node != nullptr) {
+        // Compute the whole graph and only test the output for a specific tensor
         ggml_backend_graph_compute(backend1, g1);
         ggml_backend_graph_compute(backend2, g2);

-        bool verified = false;
+        int test_node_idx = -1;
         for (int i = 0; i < g1->n_nodes; i++) {
-            for (size_t j = 0; j < num_test_nodes; ++j) {
-                if (g1->nodes[i] == test_nodes[j]) {
-                    callback(i, g1->nodes[i], g2->nodes[i], user_data);
-                    verified = true;
-                }
-            }
-        }
-        GGML_ASSERT(verified);
+            struct ggml_tensor * t1 = g1->nodes[i];
+            if (t1 == test_node) {
+                test_node_idx = i;
+                break;
+            }
+        }
+        GGML_ASSERT(test_node_idx != -1);
+
+        callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
     } else {
         for (int i = 0; i < g1->n_nodes; i++) {
             struct ggml_tensor * t1 = g1->nodes[i];

@@ -12,11 +12,11 @@ const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
 const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows

 template <cpy_kernel_t cpy_1>
-static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-        const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-        const int64_t nb12, const int64_t nb13) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+        const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= ne) {
         return;

@@ -40,10 +40,10 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
 }

 template <typename T>
-static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int64_t ne,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-        const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-        const int64_t nb12, const int64_t nb13) {
+static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+        const int nb12, const int nb13) {

     const T* src = reinterpret_cast<const T*>(cx);
     T* dst = reinterpret_cast<T*>(cdst);

@@ -117,60 +117,60 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
 }

 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-        const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-        const int64_t nb12, const int64_t nb13) {
-    const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+        const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

     if (i >= ne) {
         return;
     }

-    const int64_t i03 = i/(ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
-    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

-    const int64_t i13 = i/(ne10 * ne11 * ne12);
-    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

     cpy_blck(cx + x_offset, cdst + dst_offset);
 }

 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-        const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-        const int64_t nb12, const int64_t nb13) {
-    const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+        const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

     if (i >= ne) {
         return;
     }

-    const int64_t i03 = i/(ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
-    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

-    const int64_t i13 = i/(ne10 * ne11 * ne12);
-    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

     cpy_blck(cx + x_offset, cdst + dst_offset);
 }

 template<typename src_t, typename dst_t>
 static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= ne) {
         return;

@@ -188,20 +188,19 @@ static void ggml_cpy_scalar_contiguous_cuda(
     cudaStream_t stream) {

     const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    GGML_ASSERT(num_blocks < UINT_MAX);
     cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
         (cx, cdst, ne);
 }

 template<typename src_t, typename dst_t, bool transposed = false>
 static void ggml_cpy_scalar_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

     if (transposed) {
         GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
-        int64_t ne00n, ne01n, ne02n;
+        int ne00n, ne01n, ne02n;
         if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
             ne00n = ne00;
             ne01n = ne01;

@@ -212,159 +211,143 @@ static void ggml_cpy_scalar_cuda(
             ne02n = 1;
         }

-        int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
-        int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
-        int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
-        GGML_ASSERT(grid_x < UINT_MAX);
-        GGML_ASSERT(grid_y < USHRT_MAX);
-        GGML_ASSERT(grid_z < USHRT_MAX);
-        dim3 dimGrid(grid_x, grid_y, grid_z);
+        dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
+                      (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
+                      (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
         dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
         cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
             (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
     } else {
-        const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-        GGML_ASSERT(num_blocks < UINT_MAX);
+        const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
         cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
             (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
     }
 }

 static void ggml_cpy_f32_q8_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

     GGML_ASSERT(ne % QK8_0 == 0);
-    const int64_t num_blocks = ne / QK8_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK8_0;
     cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q8_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q4_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

     GGML_ASSERT(ne % QK4_0 == 0);
-    const int64_t num_blocks = ne / QK4_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_0;
     cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q4_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
     cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q4_1_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

     GGML_ASSERT(ne % QK4_1 == 0);
-    const int64_t num_blocks = ne / QK4_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_1;
     cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q4_1_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
     cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q5_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

     GGML_ASSERT(ne % QK5_0 == 0);
-    const int64_t num_blocks = ne / QK5_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK5_0;
     cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q5_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
     cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q5_1_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

     GGML_ASSERT(ne % QK5_1 == 0);
-    const int64_t num_blocks = ne / QK5_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK5_1;
     cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q5_1_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
     cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_iq4_nl_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

     GGML_ASSERT(ne % QK4_NL == 0);
-    const int64_t num_blocks = ne / QK4_NL;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_NL;
     cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

@@ -373,6 +356,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));

+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];

@@ -201,6 +201,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

     int64_t total_vram = 0;
+#ifdef GGML_CUDA_FORCE_MMQ
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+#endif // GGML_CUDA_FORCE_MMQ
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);

     std::vector<std::pair<int, std::string>> turing_devices_without_mma;

@@ -2181,11 +2181,7 @@ size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
     const bool has_mask = op->src[3] != nullptr;
 
-    // note: the non-vec kernel requires more extra memory, so always reserve for it
-    GGML_ASSERT(OP_FLASH_ATTN_EXT_NCPSG >= OP_FLASH_ATTN_EXT_VEC_NCPSG);
-
-    //if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
-    if (false) {
+    if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
         // note: always reserve the padding space to avoid graph reallocations
         //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
         const bool has_kvpad = true;
@@ -1517,12 +1517,10 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
     struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
     graph->n_nodes = n_nodes;
     std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
-    tensor_ptrs.reserve(n_tensors);
     for (uint32_t i = 0; i < n_tensors; i++) {
-        tensor_ptrs.emplace(tensors[i].id, &tensors[i]);
+        tensor_ptrs[tensors[i].id] = &tensors[i];
     }
     std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
-    tensor_map.reserve(n_nodes);
     for (uint32_t i = 0; i < n_nodes; i++) {
         int64_t id;
         memcpy(&id, &nodes[i], sizeof(id));
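The two `reserve()` calls dropped here are the point of the master-side change: sizing the hash tables once up front avoids rehashing while the loops insert `n_tensors` and `n_nodes` entries, and `emplace` skips the default-construct-then-assign that `operator[]` performs. A minimal sketch of the same pattern (illustrative types, not the RPC structs):

    #include <cstdint>
    #include <unordered_map>

    void fill_map(std::unordered_map<uint64_t, const void *> & m,
                  const uint64_t * ids, uint32_t n) {
        m.reserve(n);                   // one bucket allocation instead of repeated rehashes
        for (uint32_t i = 0; i < n; i++) {
            m.emplace(ids[i], nullptr); // inserts directly, no default-construct + assign
        }
    }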
@@ -434,15 +434,8 @@ static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax_norm{ GGM
                                                                              GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
                                                                              GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
                                                                              GGML_OP_RESHAPE };
 
-static constexpr std::initializer_list<ggml_op> topk_moe_sigmoid_norm_bias{ GGML_OP_UNARY, GGML_OP_RESHAPE, GGML_OP_ADD,
-                                                                            GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
-                                                                            GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
-                                                                            GGML_OP_DIV, GGML_OP_RESHAPE };
-
 static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
                                                                          GGML_OP_VIEW, GGML_OP_GET_ROWS };
 
 static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW,
                                                                         GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
                                                                         GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
 
@@ -471,32 +464,6 @@ static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softma
     { 9, 0, 8 }, // reshape->src[0] == div
 };
 
-//node #436 ( UNARY): ffn_moe_probs-10 ( 256K) [Vulka ] use=2: ffn_moe_logits-10 ( 256K) [Vulka ]
-//node #437 ( RESHAPE): ffn_moe_probs-10 (re ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ]
-//node #438 ( ADD): ffn_moe_probs_biased ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ] blk.10.exp_probs_b.b ( 0K) [Vulka ]
-//node #439 ( ARGSORT): ffn_moe_argsort-10 ( 256K) [Vulka ] use=1: ffn_moe_probs_biased ( 256K) [Vulka ]
-//node #440 ( VIEW): ffn_moe_topk-10 ( 255K) [Vulka ] use=3: ffn_moe_argsort-10 ( 256K) [Vulka ]
-//node #441 ( GET_ROWS): ffn_moe_weights-10 ( 12K) [Vulka ] use=1: ffn_moe_probs-10 (re ( 256K) [Vulka ] ffn_moe_topk-10 ( 255K) [Vulka ]
-//node #442 ( RESHAPE): ffn_moe_weights-10 ( ( 12K) [Vulka ] use=2: ffn_moe_weights-10 ( 12K) [Vulka ]
-//node #443 ( SUM_ROWS): ffn_moe_weights_sum- ( 2K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ]
-//node #444 ( CLAMP): ffn_moe_weights_sum_ ( 2K) [Vulka ] use=1: ffn_moe_weights_sum- ( 2K) [Vulka ]
-//node #445 ( DIV): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ] ffn_moe_weights_sum_ ( 2K) [Vulka ]
-//node #446 ( RESHAPE): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights_norm ( 12K) [Vulka ]
-static constexpr std::initializer_list<std::array<int, 3>> topk_moe_sigmoid_norm_bias_edges {
-    { 1, 0, 0 }, // reshape->src[0] == sigmoid
-    { 2, 0, 0 }, // add->src[0] == sigmoid
-    { 3, 0, 2 }, // argsort->src[0] == add
-    { 4, 0, 3 }, // view->src[0] == argsort
-    { 5, 0, 1 }, // get_rows->src[0] == reshape
-    { 5, 1, 4 }, // get_rows->src[1] == view
-    { 6, 0, 5 }, // reshape->src[0] == get_rows
-    { 7, 0, 6 }, // sum_rows->src[0] == reshape
-    { 8, 0, 7 }, // clamp->src[0] == sum_rows
-    { 9, 0, 6 }, // div->src[0] == reshape
-    { 9, 1, 8 }, // div->src[1] == clamp
-    {10, 0, 9 }, // reshape->src[0] == div
-};
-
 // same as early_softmax_norm but ending after the get_rows
 static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_edges {
     { 1, 0, 0 }, // reshape->src[0] == softmax
@@ -524,10 +491,16 @@ enum topk_moe_mode {
     TOPK_MOE_EARLY_SOFTMAX,
     TOPK_MOE_EARLY_SOFTMAX_NORM,
     TOPK_MOE_LATE_SOFTMAX,
-    TOPK_MOE_SIGMOID_NORM_BIAS,
     TOPK_MOE_COUNT,
 };
 
+static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
+    topk_moe_mode mode = num == topk_moe_early_softmax_norm.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX_NORM :
+                         num == topk_moe_early_softmax.size() - 1      ? TOPK_MOE_EARLY_SOFTMAX :
+                                                                         TOPK_MOE_LATE_SOFTMAX;
+    return mode;
+}
+
 static constexpr std::initializer_list<std::array<int, 3>> rope_view_set_rows_edges {
     { 1, 0, 0 }, // view->src[0] == rope
     { 2, 0, 1 }, // set_rows->src[0] == view
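The helper restored on the b7592 side recovers the fusion mode purely from the fused-op count, which works because the three surviving patterns have distinct lengths. Worked through with the lists above:

    // topk_moe_early_softmax_norm has 10 ops -> num_additional_fused_ops == 9 -> TOPK_MOE_EARLY_SOFTMAX_NORM
    // topk_moe_early_softmax      has  5 ops -> num_additional_fused_ops == 4 -> TOPK_MOE_EARLY_SOFTMAX
    // topk_moe_late_softmax       has  6 ops -> num_additional_fused_ops == 5 -> TOPK_MOE_LATE_SOFTMAX
    // Any other count falls through to TOPK_MOE_LATE_SOFTMAX, so the helper is
    // only safe for counts produced by one of these fusions. Master drops it
    // and records the mode in ctx->fused_topk_moe_mode at fusion time instead,
    // because its optional trailing GGML_OP_SCALE fusion (further down in this
    // diff) increments the count and would make count-based inference
    // ambiguous: early_softmax plus a fused scale also yields 5.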
@@ -765,9 +738,6 @@ struct vk_device_struct {
     vk_pipeline pipeline_topk_f32[num_topk_pipelines];
     vk_pipeline pipeline_sum_rows_f32;
     vk_pipeline pipeline_cumsum_f32;
-    vk_pipeline pipeline_cumsum_small_f32;
-    vk_pipeline pipeline_cumsum_multipass1_f32;
-    vk_pipeline pipeline_cumsum_multipass2_f32;
     vk_pipeline pipeline_argmax_f32;
     vk_pipeline pipeline_count_equal_i32;
     std::map<vk_solve_tri_pipeline_state, vk_pipeline> pipeline_solve_tri_f32;
@@ -796,7 +766,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_count_experts;
 
     // [2] is for whether to take n_experts from spec constant (0) or push constant (1)
-    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2];
+    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
 
     std::vector<vk_pipeline_ref> all_pipelines;
 
@@ -1211,11 +1181,6 @@ struct vk_op_topk_moe_push_constants {
     uint32_t n_expert_used;
     float clamp_min;
     float clamp_max;
-    uint32_t gating_func;
-    uint32_t has_bias;
-    uint32_t with_norm;
-    float output_scale;
-    float output_bias;
 };
 
 struct vk_op_add_id_push_constants {
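The five fields removed here are how master folds every fusion variant into one shader: the variant is described per dispatch in push-constant data instead of being compiled into the pipeline (b7592 instead keys the pipeline array by mode and bakes the variant into specialization constants, per the [TOPK_MOE_COUNT] hunk above). Condensed from the push-constant setup in ggml_vk_topk_moe later in this diff:

    // Values the master side writes per dispatch (see ggml_vk_topk_moe below):
    //   gating_func  : GATING_FUNC_SIGMOID for the sigmoid+bias pattern,
    //                  GATING_FUNC_SOFTMAX_WEIGHT for late softmax,
    //                  GATING_FUNC_SOFTMAX otherwise
    //   has_bias     : only the sigmoid pattern reads the extra bias binding
    //   with_norm    : early_softmax_norm and sigmoid_norm_bias normalize the top-k weights
    //   output_scale : parameters of a fused trailing GGML_OP_SCALE, else 1.0f
    //   output_bias  : likewise, else 0.0f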
@@ -1806,8 +1771,6 @@ struct ggml_backend_vk_context {
     // Bit 'i' means nodes[start_of_fusion + i] writes to memory.
     // If there's no fusion, bit 0 is still set.
     int fused_ops_write_mask {};
-    topk_moe_mode fused_topk_moe_mode {};
-    bool fused_topk_moe_scale {};
 
     // for GGML_VK_PERF_LOGGER
     std::unique_ptr<vk_perf_logger> perf_logger;
@@ -2705,7 +2668,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
     switch (src0_type) {
     case GGML_TYPE_IQ1_S:
     case GGML_TYPE_IQ1_M:
-        lut_size = 2*2048 + 4*2048;
+        lut_size = 2*2048;
         break;
     case GGML_TYPE_IQ2_XXS:
         lut_size = 8*256;
@@ -3630,7 +3593,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
     uint32_t rm_kq = 2;
     uint32_t rm_stdq_int = 1;
     uint32_t rm_kq_int = 1;
-    auto const &rm_iq_int = [](uint32_t i) { return i == 0 ? 8u : 4u; };
     if (device->vendor_id == VK_VENDOR_ID_AMD) {
         if (device->architecture == AMD_GCN) {
             rm_stdq = 2;
@@ -3734,10 +3696,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_q8_1_f32", arr_dmmv_q4_k_q8_1_f32_len[reduc], arr_dmmv_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_q8_1_f32", arr_dmmv_q5_k_q8_1_f32_len[reduc], arr_dmmv_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_q8_1_f32", arr_dmmv_q6_k_q8_1_f32_len[reduc], arr_dmmv_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
-
-                ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_q8_1_f32", arr_dmmv_iq1_s_q8_1_f32_len[reduc], arr_dmmv_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int);
-                ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_q8_1_f32", arr_dmmv_iq1_m_q8_1_f32_len[reduc], arr_dmmv_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int);
-
             }
 #endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
         }
@@ -3784,9 +3742,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
             ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
             ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
             ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
-
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
         }
 #endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
     }
@@ -3794,7 +3749,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #if !defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
     GGML_UNUSED(rm_stdq_int);
     GGML_UNUSED(rm_kq_int);
-    GGML_UNUSED(rm_iq_int);
 #endif
 
     // dequant shaders
@@ -4181,11 +4135,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
-    const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size, 1 }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size }, 1, true, true, device->subgroup_size);
 
     ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1);
 
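On the master side the single-pass cumsum spec constants are {BLOCK_SIZE = 256, SUBGROUP_SIZE, ELEM_PER_THREAD}, with ELEM_PER_THREAD chosen per vendor; the small variant keeps 128 threads at one element each. The loop-count arithmetic this buys is checkable at compile time:

    #include <cstdint>

    constexpr uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

    // One workgroup iteration covers BLOCK_SIZE * ELEM_PER_THREAD columns:
    static_assert(ceil_div(4096, 256 * 4) == 4, "default config: 4 loop iterations for a 4096-wide row");
    static_assert(ceil_div(4096, 256 * 2) == 8, "AMD/Intel config (ELEM_PER_THREAD = 2): 8 iterations");
    static_assert(ceil_div( 512, 128 * 1) == 4, "small-row pipeline: 128 threads, 1 element each");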
@@ -4341,7 +4291,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     for (uint32_t use_push = 0; use_push < 2; ++use_push) {
         for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
-            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
         }
     }
 
@@ -5632,8 +5584,6 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
             break;
         default:
             return nullptr;
@@ -5790,8 +5740,6 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
             break;
         default:
             return nullptr;
@@ -7057,7 +7005,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
     // Quantization overhead is not worth it for small k
     switch (device->vendor_id) {
     case VK_VENDOR_ID_NVIDIA:
-        if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) {
+        if (src0_type == GGML_TYPE_Q2_K) {
             return true;
         }
 
@@ -8736,9 +8684,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         if (ctx->num_additional_fused_ops) {
             uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
             GGML_ASSERT(idx < num_topk_moe_pipelines);
+            topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
             // use n_experts from push constant if it's not equal to the power of two spec constant
             bool use_push = dst->ne[0] != (1u << idx);
-            return ctx->device->pipeline_topk_moe[idx][use_push];
+            return ctx->device->pipeline_topk_moe[idx][mode][use_push];
         }
 
         if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@@ -8811,12 +8760,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return nullptr;
         case GGML_OP_CUMSUM:
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                if (src0->ne[0] <= 512) {
-                    return ctx->device->pipeline_cumsum_small_f32;
-                } else {
                     return ctx->device->pipeline_cumsum_f32;
-                }
             }
             return nullptr;
         case GGML_OP_SOLVE_TRI:
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
@@ -10401,16 +10346,14 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub
 }
 
 static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
-    topk_moe_mode mode = ctx->fused_topk_moe_mode;
+    topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
     ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
-    ggml_tensor * bias = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 2]->src[1] : logits;
-    ggml_tensor * weights = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
-    ggml_tensor * ids = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 4] :
-                        (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] :
-                        cgraph->nodes[node_idx + 3];
+    ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
+                            (mode == TOPK_MOE_EARLY_SOFTMAX) ? cgraph->nodes[node_idx + 4] :
+                            cgraph->nodes[node_idx + 5];
+    ggml_tensor * ids = (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : cgraph->nodes[node_idx + 3];
 
     GGML_ASSERT(logits->type == GGML_TYPE_F32);
-    GGML_ASSERT(bias->type == GGML_TYPE_F32);
     GGML_ASSERT(weights->type == GGML_TYPE_F32);
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
 
@@ -10425,7 +10368,6 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
     ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
 
     vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits);
-    vk_subbuffer bias_buf = ggml_vk_tensor_subbuffer(ctx, bias);
     vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights);
     vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids);
 
@@ -10433,45 +10375,18 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
     pc.n_rows = n_rows;
     pc.n_experts_push = n_experts;
     pc.n_expert_used = n_expert_used;
-    pc.clamp_min = -std::numeric_limits<float>::infinity();
-    pc.clamp_max = std::numeric_limits<float>::infinity();
     if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
         ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
-        GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
         pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
         pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
     }
-    if (mode == TOPK_MOE_SIGMOID_NORM_BIAS) {
-        ggml_tensor * clamp = cgraph->nodes[node_idx + 8];
-        GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
-        pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
-        pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
-    }
-
-#define GATING_FUNC_SOFTMAX 0
-#define GATING_FUNC_SIGMOID 1
-#define GATING_FUNC_SOFTMAX_WEIGHT 2
-
-    pc.gating_func = mode == TOPK_MOE_SIGMOID_NORM_BIAS ? GATING_FUNC_SIGMOID :
-                     mode == TOPK_MOE_LATE_SOFTMAX ? GATING_FUNC_SOFTMAX_WEIGHT :
-                     GATING_FUNC_SOFTMAX;
-    pc.has_bias = mode == TOPK_MOE_SIGMOID_NORM_BIAS;
-    pc.with_norm = mode == TOPK_MOE_EARLY_SOFTMAX_NORM || mode == TOPK_MOE_SIGMOID_NORM_BIAS;
-    if (ctx->fused_topk_moe_scale) {
-        GGML_ASSERT(weights->op == GGML_OP_SCALE);
-        pc.output_scale = ggml_get_op_params_f32(weights, 0);
-        pc.output_bias = ggml_get_op_params_f32(weights, 1);
-    } else {
-        pc.output_scale = 1.0f;
-        pc.output_bias = 0.0f;
-    }
 
     GGML_ASSERT(n_expert_used <= n_experts);
 
     const uint32_t rows_per_block = 4;
     std::array<uint32_t, 3> elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 };
 
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, bias_buf, weights_buf, ids_buf}, pc, elements);
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, weights_buf, ids_buf}, pc, elements);
 }
 
 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) {
@@ -10719,50 +10634,8 @@ static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, cons
 }
 
 static void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    vk_op_sum_rows_push_constants pc = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
-    // Use the single pass shader when the rows are small or there are enough rows to fill the GPU.
-    // For fewer, larger rows, use the multipass shader to spread each row across SMs.
-    if (dst->ne[0] <= 4096 || ggml_nrows(dst) >= ctx->device->shader_core_count) {
-        ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, pc);
-        return;
-    }
-
-    // First pass computes partial sums within a block, and stores the last partial
-    // to the temp buffer. Second pass sums the block partials from the temp buffer
-    // and adds that to the result of the first pass.
-    vk_pipeline pipeline1 = ctx->device->pipeline_cumsum_multipass1_f32;
-    vk_pipeline pipeline2 = ctx->device->pipeline_cumsum_multipass2_f32;
-    GGML_ASSERT(pipeline1 != nullptr && pipeline2 != nullptr);
-
-    ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
-    ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
-
-    std::array<uint32_t, 3> elements;
-
-    elements[0] = dst->ne[0];
-    elements[1] = (uint32_t)ggml_nrows(dst);
-    elements[2] = 1;
-
-    size_t temp_size = sizeof(float) * elements[0] * ggml_nrows(dst);
-
-    if (ctx->prealloc_size_split_k < temp_size) {
-        ctx->prealloc_size_split_k = temp_size;
-        ggml_vk_preallocate_buffers(ctx, subctx);
-    }
-
-    vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0);
-    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
-    vk_subbuffer temp_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
-
-    if (ctx->prealloc_split_k_need_sync) {
-        ggml_vk_sync_buffers(ctx, subctx);
-    }
-
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, {src_buf, dst_buf, temp_buf}, pc, elements);
-    ggml_vk_sync_buffers(ctx, subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, {src_buf, dst_buf, temp_buf}, pc, elements);
-
-    ctx->prealloc_split_k_need_sync = true;
+    vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, p);
 }
 
 static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
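The multipass path removed above is a standard two-kernel scan: pass 1 writes a per-workgroup inclusive scan and stores each block's total to a temp buffer (reusing the split_k scratch allocation), and after a sync, pass 2 adds the sum of all preceding block totals to each element. A serial C++ model of the same dataflow, one row, with `block` standing in for the workgroup width:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Serial model of cumsum_multipass1/2 (the shaders deleted further below).
    std::vector<float> cumsum_two_pass(const std::vector<float> & x, size_t block) {
        std::vector<float> y(x.size());
        std::vector<float> partials((x.size() + block - 1) / block);
        // pass 1: inclusive scan inside each block, record the block total
        for (size_t b = 0; b * block < x.size(); b++) {
            float run = 0.0f;
            const size_t end = std::min(x.size(), (b + 1) * block);
            for (size_t i = b * block; i < end; i++) {
                run += x[i];
                y[i] = run;
            }
            partials[b] = run;
        }
        // pass 2: each block adds the totals of all blocks before it
        for (size_t b = 1; b * block < x.size(); b++) {
            float carry = 0.0f;
            for (size_t p = 0; p < b; p++) {
                carry += partials[p];
            }
            const size_t end = std::min(x.size(), (b + 1) * block);
            for (size_t i = b * block; i < end; i++) {
                y[i] += carry;
            }
        }
        return y;
    }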
@@ -12255,11 +12128,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgr
 
         break;
     case GGML_OP_UNARY:
-        if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
-            ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
-            break;
-        }
-
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_EXP:
         case GGML_UNARY_OP_SILU:
@@ -12307,7 +12175,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgr
 
         break;
     case GGML_OP_SOFT_MAX:
-        if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
+        if (ctx->num_additional_fused_ops) {
             ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
         } else {
             ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node);
@@ -12327,7 +12195,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgr
 
         break;
     case GGML_OP_ARGSORT:
-        if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
+        if (ctx->num_additional_fused_ops) {
             ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
         } else {
             ggml_vk_argsort(ctx, compute_ctx, src0, node);
@@ -13180,24 +13048,6 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
         get_rows = cgraph->nodes[node_idx + 4];
         argsort = cgraph->nodes[node_idx + 2];
         break;
-    case TOPK_MOE_SIGMOID_NORM_BIAS:
-        softmax = cgraph->nodes[node_idx + 0]; // really sigmoid
-        weights = cgraph->nodes[node_idx + 10];
-        get_rows = cgraph->nodes[node_idx + 5];
-        argsort = cgraph->nodes[node_idx + 3];
-        if (ggml_get_unary_op(softmax) != GGML_UNARY_OP_SIGMOID) {
-            return false;
-        }
-        // bias is expected to be 1D
-        if (ggml_nrows(cgraph->nodes[node_idx + 2]->src[1]) != 1 ||
-            !ggml_is_contiguous(cgraph->nodes[node_idx + 2]->src[1])) {
-            return false;
-        }
-        // sigmoid fusion seems to generate infinities on moltenvk
-        if (ctx->device->driver_id == vk::DriverId::eMoltenvk) {
-            return false;
-        }
-        break;
     case TOPK_MOE_EARLY_SOFTMAX:
         softmax = cgraph->nodes[node_idx + 0];
         weights = cgraph->nodes[node_idx + 4];
@@ -13221,20 +13071,19 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
     probs = probs->src[0];
     ggml_tensor * selection_probs = argsort->src[0];
 
-    if (probs != selection_probs && mode != TOPK_MOE_SIGMOID_NORM_BIAS) {
+    if (probs != selection_probs) {
         return false;
     }
 
-    if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
-        return false;
-    }
-
-    if (softmax->op == GGML_OP_SOFT_MAX) {
     const float * op_params = (const float *)softmax->op_params;
 
     float scale = op_params[0];
     float max_bias = op_params[1];
 
+    if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
+        return false;
+    }
+
     if (scale != 1.0f || max_bias != 0.0f) {
         return false;
     }
@@ -13243,7 +13092,6 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
     if (softmax->src[1] || softmax->src[2]) {
         return false;
     }
-    }
 
     const int n_expert = softmax->ne[0];
     if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
@@ -13515,8 +13363,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             total_mul_mat_bytes += bytes;
         }
 
-        ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
-        ctx->fused_topk_moe_scale = false;
         const char *fusion_string {};
         if (!ctx->device->disable_fusion) {
             uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
@@ -13562,23 +13408,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                 ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
                 // view of argsort writes to memory
                 ctx->fused_ops_write_mask |= 1 << 3;
-                ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM;
                 fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
-            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) &&
-                       ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) &&
-                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) {
-                ctx->num_additional_fused_ops = topk_moe_sigmoid_norm_bias.size() - 1;
-                // view of argsort writes to memory
-                ctx->fused_ops_write_mask |= 1 << 4;
-                ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS;
-                fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS";
             } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
                        ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
                        ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
                 ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
                 // view of argsort writes to memory
                 ctx->fused_ops_write_mask |= 1 << 3;
-                ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX;
                 fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
             } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
                        ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
@@ -13586,17 +13422,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                 ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
                 // view of argsort writes to memory
                 ctx->fused_ops_write_mask |= 1 << 1;
-                ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX;
                 fusion_string = "TOPK_MOE_LATE_SOFTMAX";
             }
-            if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
-                // Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano.
-                if (ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops - 1, { GGML_OP_DIV, GGML_OP_RESHAPE, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 }) ||
-                    ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) {
-                    ctx->fused_topk_moe_scale = true;
-                    ctx->num_additional_fused_ops++;
-                }
-            }
         }
         ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
 
@@ -13775,9 +13602,6 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
             if (keep_pattern(topk_moe_early_softmax_norm)) {
                 continue;
             }
-            if (keep_pattern(topk_moe_sigmoid_norm_bias)) {
-                continue;
-            }
             if (keep_pattern(topk_moe_early_softmax)) {
                 continue;
             }
@@ -13804,7 +13628,6 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
             }
             // Don't pull forward nodes from fusion patterns
             if (match_pattern(topk_moe_early_softmax_norm, j) ||
-                match_pattern(topk_moe_sigmoid_norm_bias, j) ||
                 match_pattern(topk_moe_early_softmax, j) ||
                 match_pattern(topk_moe_late_softmax, j)) {
                 continue;
@@ -14,7 +14,6 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 128;
 layout (constant_id = 1) const uint SUBGROUP_SIZE = 32;
-layout (constant_id = 2) const uint ELEM_PER_THREAD = 4;
 
 #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
 
@@ -39,45 +38,32 @@ void main() {
         last_sum = 0;
     }
 
-    uint col = tid * ELEM_PER_THREAD;
-    uint num_iter = CEIL_DIV(p.n_cols, BLOCK_SIZE * ELEM_PER_THREAD);
+    uint col = tid;
+    uint num_iter = CEIL_DIV(p.n_cols, BLOCK_SIZE);
     for (int i = 0; i < num_iter; ++i) {
-        FLOAT_TYPE v[ELEM_PER_THREAD];
-        FLOAT_TYPE thread_sum = 0;
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            if (col + j < p.n_cols) {
-                thread_sum += FLOAT_TYPE(data_a[src_idx + col + j]);
-            }
-            v[j] = thread_sum;
-        }
-
-        thread_sum = subgroupExclusiveAdd(thread_sum);
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            v[j] += thread_sum;
-        }
+        FLOAT_TYPE v = 0;
+        if (col < p.n_cols) {
+            v = FLOAT_TYPE(data_a[src_idx + col]);
+        }
+        v = subgroupInclusiveAdd(v);
+
         // Store the largest partial sum for each subgroup, then add the partials for all
         // lower subgroups and the final partial sum from the previous iteration.
         if (gl_SubgroupInvocationID == SUBGROUP_SIZE - 1) {
-            partial[subgroup_id] = v[ELEM_PER_THREAD - 1];
+            partial[subgroup_id] = v;
         }
         barrier();
-        for (int s = 0; s < subgroup_id; ++s) {
-            [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-                v[j] += partial[s];
-            }
-        }
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            v[j] += last_sum;
-        }
+        for (int j = 0; j < subgroup_id; ++j) {
+            v += partial[j];
+        }
+        v += last_sum;
         barrier();
         if (tid == BLOCK_SIZE - 1) {
-            last_sum = v[ELEM_PER_THREAD - 1];
+            last_sum = v;
         }
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            if (col + j < p.n_cols) {
-                data_d[dst_idx + col + j] = D_TYPE(v[j]);
-            }
+        if (col < p.n_cols) {
+            data_d[dst_idx + col] = D_TYPE(v);
         }
-        col += BLOCK_SIZE * ELEM_PER_THREAD;
+        col += BLOCK_SIZE;
     }
 }
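What the master-side ELEM_PER_THREAD rewrite above buys: each thread first accumulates its own run of elements serially in registers, then a single subgroupExclusiveAdd over the per-thread totals positions every run, so the subgroup scan executes once per ELEM_PER_THREAD elements instead of once per element. A serial C++ model of one workgroup iteration (the subgroup scan becomes the `offset` loop; cross-subgroup partials and `last_sum` are left out for brevity):

    #include <cstddef>
    #include <vector>

    // Model of the loop body above: 'threads' lanes, each owning 'e' elements.
    void scan_iteration(const float * in, float * out, size_t n, size_t threads, size_t e) {
        std::vector<float> lane_total(threads, 0.0f);
        // per-lane serial inclusive scan of its own run (v[j] in the shader)
        for (size_t t = 0; t < threads; t++) {
            float run = 0.0f;
            for (size_t j = 0; j < e; j++) {
                const size_t i = t * e + j;
                if (i < n) {
                    run += in[i];
                    out[i] = run;
                }
            }
            lane_total[t] = run;
        }
        // exclusive scan over lane totals == subgroupExclusiveAdd(thread_sum)
        float offset = 0.0f;
        for (size_t t = 0; t < threads; t++) {
            for (size_t j = 0; j < e; j++) {
                const size_t i = t * e + j;
                if (i < n) {
                    out[i] += offset;
                }
            }
            offset += lane_total[t];
        }
    }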
@@ -1,60 +0,0 @@
-#version 450
-
-#include "types.glsl"
-#include "sum_rows.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-#extension GL_KHR_shader_subgroup_basic : enable
-
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-layout (binding = 2) writeonly buffer T {D_TYPE data_t[];};
-
-layout (constant_id = 0) const uint BLOCK_SIZE = 128;
-layout (constant_id = 1) const uint SUBGROUP_SIZE = 32;
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-
-shared FLOAT_TYPE partial[BLOCK_SIZE / SUBGROUP_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.y;
-    const uint tid = gl_LocalInvocationID.x;
-    const uint col = gl_GlobalInvocationID.x;
-
-    const uint i03 = fastdiv(row, p.ne0_12mp, p.ne0_12L);
-    const uint i03_offset = i03 * p.ne01*p.ne02;
-    const uint i02 = fastdiv(row - i03_offset, p.ne0_1mp, p.ne0_1L);
-    const uint i01 = row - i03_offset - i02*p.ne01;
-
-    const uint src_idx = get_aoffset() + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
-    const uint dst_idx = get_doffset() + i01 * p.nb11 + i02 * p.nb12 + i03 * p.nb13;
-
-    uint subgroup_id = tid / SUBGROUP_SIZE;
-
-    FLOAT_TYPE v = 0;
-    if (col < p.n_cols) {
-        v = FLOAT_TYPE(data_a[src_idx + col]);
-    }
-    v = subgroupInclusiveAdd(v);
-
-    // Store the largest partial sum for each subgroup, then add the partials for all
-    // lower subgroups and the final partial sum from the previous iteration.
-    if (gl_SubgroupInvocationID == SUBGROUP_SIZE - 1) {
-        partial[subgroup_id] = v;
-    }
-    barrier();
-    for (int j = 0; j < subgroup_id; ++j) {
-        v += partial[j];
-    }
-    barrier();
-    if (tid == BLOCK_SIZE - 1) {
-        data_t[gl_WorkGroupID.x + gl_NumWorkGroups.x * row] = v;
-    }
-    if (col < p.n_cols) {
-        data_d[dst_idx + col] = D_TYPE(v);
-    }
-}
@@ -1,66 +0,0 @@
-#version 450
-
-#include "types.glsl"
-#include "sum_rows.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-#extension GL_KHR_shader_subgroup_basic : enable
-
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) buffer D {D_TYPE data_d[];};
-layout (binding = 2) readonly buffer T {D_TYPE data_t[];};
-
-layout (constant_id = 0) const uint BLOCK_SIZE = 128;
-layout (constant_id = 1) const uint SUBGROUP_SIZE = 32;
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-
-shared FLOAT_TYPE temp[BLOCK_SIZE / SUBGROUP_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.y;
-    const uint tid = gl_LocalInvocationID.x;
-
-    const uint i03 = fastdiv(row, p.ne0_12mp, p.ne0_12L);
-    const uint i03_offset = i03 * p.ne01*p.ne02;
-    const uint i02 = fastdiv(row - i03_offset, p.ne0_1mp, p.ne0_1L);
-    const uint i01 = row - i03_offset - i02*p.ne01;
-
-    const uint src_idx = get_aoffset() + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
-    const uint dst_idx = get_doffset() + i01 * p.nb11 + i02 * p.nb12 + i03 * p.nb13;
-
-    const uint col = gl_GlobalInvocationID.x;
-
-    float v = 0;
-    // prefetch value we're adding to
-    if (col < p.n_cols) {
-        v = data_d[dst_idx + col];
-    }
-
-    // compute the sum of all previous blocks
-    uint c = tid;
-    float sum = 0;
-    while (c < gl_WorkGroupID.x) {
-        sum += data_t[c + gl_NumWorkGroups.x * row];
-        c += BLOCK_SIZE;
-    }
-
-    sum = subgroupAdd(sum);
-    if (gl_SubgroupInvocationID == 0) {
-        temp[gl_SubgroupID] = sum;
-    }
-    barrier();
-    sum = 0;
-    [[unroll]] for (uint s = 0; s < BLOCK_SIZE / SUBGROUP_SIZE; ++s) {
-        sum += temp[s];
-    }
-
-    // Add the sum to what the first pass computed
-    if (col < p.n_cols) {
-        data_d[dst_idx + col] = v + sum;
-    }
-}
@@ -14,8 +14,6 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 #define K_PER_ITER 8
 #elif defined(DATA_A_QUANT_K)
 #define K_PER_ITER 16
-#elif defined(DATA_A_IQ1_S) || defined(DATA_A_IQ1_M)
-#define K_PER_ITER 32
 #else
 #error unimplemented
 #endif
@@ -51,15 +49,6 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
         cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 1];
         cache_b_qs[2] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 2];
         cache_b_qs[3] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 3];
-#elif K_PER_ITER == 32
-        cache_b_qs[0] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8    ];
-        cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 1];
-        cache_b_qs[2] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 2];
-        cache_b_qs[3] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 3];
-        cache_b_qs[4] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 4];
-        cache_b_qs[5] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 5];
-        cache_b_qs[6] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 6];
-        cache_b_qs[7] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 7];
 #else
 #error unimplemented
 #endif
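For reference while reading the next hunk: the IQ1_S path it deletes decodes each group of 8 weights from an 11-bit index into iq1s_grid_gpu (8 low bits from qs, 3 high bits from qh), a per-block delta whose sign is bit 15 of qh, and a scale dl = d * (2 * qh[14:12] + 1). A scalar sketch of that decode (nibble order follows the table packing; the grid stores q + 1, which the `- 1` below cancels; delta_mag stands in for IQ1S_DELTA so the sketch stays self-contained):

    #include <cstdint>

    // Scalar model of repack8/get_dm from the hunk below. 'grid_entry' is
    // iq1s_grid_gpu[qs | (hi << 8)]; pass IQ1S_DELTA as delta_mag.
    void iq1s_decode8(uint32_t grid_entry, uint32_t qh, float d, float delta_mag, float out[8]) {
        const float delta = (qh & 0x8000) ? -delta_mag : delta_mag;  // sign = bit 15 of qh
        const float dl    = d * (float)(2 * ((qh >> 12) & 0x7) + 1); // 3-bit block scale
        for (int j = 0; j < 8; j++) {
            const int q = (int)((grid_entry >> (4 * j)) & 0xF);      // packed nibble = q + 1
            out[j] = dl * ((float)q + delta - 1.0f);
        }
    }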
@@ -377,118 +377,3 @@ FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
     return FLOAT_TYPE(float(cache_b_ds.x) * float(d_scale) * float(q_sum));
 }
 #endif
-
-#if defined(DATA_A_IQ1_S)
-void repack8(uint ib, uint iqs, out i32vec4 out0, out i32vec4 out1) {
-    const uint ib32 = iqs / 32;
-
-    const uint qh = data_a[ib].qh[ib32];
-
-    const uint qs16_0 = data_a_packed16[ib].qs[(4 * ib32 + 0) / 2];
-    const uint qs16_1 = data_a_packed16[ib].qs[(4 * ib32 + 2) / 2];
-
-    const uint qs0 = qs16_0 & 0xFF;
-    const uint qs1 = qs16_0 >> 8;
-    const uint qs2 = qs16_1 & 0xFF;
-    const uint qs3 = qs16_1 >> 8;
-
-    const uint hi0 = bitfieldExtract(qh, 3 * int(0), 3);
-    const uint hi1 = bitfieldExtract(qh, 3 * int(1), 3);
-    const uint hi2 = bitfieldExtract(qh, 3 * int(2), 3);
-    const uint hi3 = bitfieldExtract(qh, 3 * int(3), 3);
-
-    const int32_t grid0 = int32_t(iq1s_grid_gpu[qs0 | (hi0 << 8)]);
-    const int32_t grid1 = int32_t(iq1s_grid_gpu[qs1 | (hi1 << 8)]);
-    const int32_t grid2 = int32_t(iq1s_grid_gpu[qs2 | (hi2 << 8)]);
-    const int32_t grid3 = int32_t(iq1s_grid_gpu[qs3 | (hi3 << 8)]);
-
-    out0 = i32vec4((grid0 >> 0) & 0x0F0F0F0F,
-                   (grid0 >> 4) & 0x0F0F0F0F,
-                   (grid1 >> 0) & 0x0F0F0F0F,
-                   (grid1 >> 4) & 0x0F0F0F0F);
-    out1 = i32vec4((grid2 >> 0) & 0x0F0F0F0F,
-                   (grid2 >> 4) & 0x0F0F0F0F,
-                   (grid3 >> 0) & 0x0F0F0F0F,
-                   (grid3 >> 4) & 0x0F0F0F0F);
-}
-
-vec2 get_dm(uint ib, uint iqs) {
-    const uint ib32 = iqs / 32;
-
-    const uint qh = data_a[ib].qh[ib32];
-    const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
-
-    const float d = float(data_a[ib].d);
-    const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
-
-    // the -1 cancels out the bias in iq1s_grid_gpu
-    return FLOAT_TYPE_VEC2(dl, dl * (delta - 1));
-}
-
-FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
-    int32_t q_sum = 0;
-
-    const uint ib_k = ib_a / 8;
-    const uint iqs_k = (ib_a % 8) * 32 + iqs * 32;
-
-    i32vec4 qs_a0;
-    i32vec4 qs_a1;
-    repack8(ib_k, iqs_k, qs_a0, qs_a1);
-
-    const vec2 dm = get_dm(ib_k, iqs_k);
-
-    q_sum += dotPacked4x8EXT(qs_a0.x, cache_b_qs[0]);
-    q_sum += dotPacked4x8EXT(qs_a0.y, cache_b_qs[1]);
-    q_sum += dotPacked4x8EXT(qs_a0.z, cache_b_qs[2]);
-    q_sum += dotPacked4x8EXT(qs_a0.w, cache_b_qs[3]);
-    q_sum += dotPacked4x8EXT(qs_a1.x, cache_b_qs[4]);
-    q_sum += dotPacked4x8EXT(qs_a1.y, cache_b_qs[5]);
-    q_sum += dotPacked4x8EXT(qs_a1.z, cache_b_qs[6]);
-    q_sum += dotPacked4x8EXT(qs_a1.w, cache_b_qs[7]);
-
-    return FLOAT_TYPE(float(cache_b_ds.x) * float(dm.x) * float(q_sum) + float(dm.y) * float(cache_b_ds.y));
-}
-#endif
-
-#if defined(DATA_A_IQ1_M)
-FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
-    const uint ib_k = ib_a / 8;
-    const uint iqs_k = (ib_a % 8) * 32 + iqs * 32;
-
-    const uint ib32 = iqs_k / 32;
-    const uint ib64 = ib32 / 2;
-
-    const uint16_t[4] scales = data_a[ib_k].scales;
-    const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
-    const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
-
-    const uint qs32 = data_a_packed32[ib_k].qs[ib32];
-    const uint qh16 = data_a_packed16[ib_k].qh[ib32];
-
-    float sum = 0;
-    const uint sc = data_a[ib_k].scales[ib64];
-    [[unroll]] for (int l = 0; l < 4; ++l) {
-        const uint ib16 = 2 * ib32 + l / 2;
-        const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1);
-        const uint qh = qh16 >> (4 * l);
-        const uint qs = (qs32 >> (8 * l)) & 0xFF;
-        const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
-
-        const int32_t grid = int32_t(iq1s_grid_gpu[qs | ((qh & 7) << 8)]);
-
-        int32_t q_sum = 0;
-        q_sum += dotPacked4x8EXT((grid >> 0) & 0x0F0F0F0F, cache_b_qs[2 * l + 0]);
-        q_sum += dotPacked4x8EXT((grid >> 4) & 0x0F0F0F0F, cache_b_qs[2 * l + 1]);
-
-        int32_t y_sum = 0;
-        y_sum += dotPacked4x8EXT(int(0x01010101), cache_b_qs[2 * l + 0]);
-        y_sum += dotPacked4x8EXT(int(0x01010101), cache_b_qs[2 * l + 1]);
-
-        // the -1 cancels out the bias in iq1s_grid_gpu
-        sum += dl * (q_sum + y_sum * (delta - 1));
-    }
-    sum *= float(cache_b_ds.x);
-
-    return sum;
-}
-#endif

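Both removed kernels carry the comment "the -1 cancels out the bias in iq1s_grid_gpu", which is the one piece of algebra worth spelling out: the shared grid table stores each ternary weight g biased as g + 1 so it fits an unsigned nibble, and rather than unbiasing during unpacking, the shader folds the bias and the per-block offset delta into a single fused term. A minimal sketch of the identity in Python (names are illustrative, not from the codebase):

# Each true weight g is stored biased as (g + 1) in a 4-bit field.
# The shader computes dot(biased, y) with integer dot products, then
# folds the per-group offset `delta` and the -1 bias into one term:
#   sum((g + delta) * y) == sum((g + 1) * y) + (delta - 1) * sum(y)
def fused_dot(biased_grid, y, delta):
    q_sum = sum(b * yi for b, yi in zip(biased_grid, y))  # dotPacked4x8-style
    y_sum = sum(y)                                        # dot with 0x01010101
    return q_sum + (delta - 1) * y_sum

# Reference computation with the unbiased weights, for comparison:
def reference_dot(grid, y, delta):
    return sum((g + delta) * yi for g, yi in zip(grid, y))

grid = [-1, 0, 1, 1, -1, 0, 0, 1]    # ternary weights of one group
biased = [g + 1 for g in grid]       # what iq1s_grid_gpu stores
y = [3, -2, 5, 1, 0, -7, 4, 2]       # int8 activations
assert fused_dot(biased, y, 0.125) == reference_dot(grid, y, 0.125)

This is why the hot loop needs only unsigned nibble masks and one extra dot product against 0x01010101, instead of a signed unpack per weight.
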
@@ -7,10 +7,6 @@

 #include "types.glsl"

-#define GATING_FUNC_SOFTMAX 0
-#define GATING_FUNC_SIGMOID 1
-#define GATING_FUNC_SOFTMAX_WEIGHT 2
-
 layout (push_constant) uniform parameter
 {
     uint n_rows;

@@ -18,18 +14,15 @@ layout (push_constant) uniform parameter
     uint n_expert_used;
     float clamp_min;
     float clamp_max;
-    uint gating_func;
-    uint has_bias;
-    uint with_norm;
-    float output_scale;
-    float output_bias;
 };

 layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;

 layout(constant_id = 0) const uint WARP_SIZE = 32;
 layout(constant_id = 1) const uint n_experts_spec = 512;
-layout(constant_id = 2) const bool nexperts_use_push = false;
+layout(constant_id = 2) const bool with_norm = true;
+layout(constant_id = 3) const bool late_softmax = false;
+layout(constant_id = 4) const bool nexperts_use_push = false;

 uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;

@@ -38,9 +31,8 @@ uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
 const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);

 layout (binding = 0, std430) readonly buffer Logits {float logits[];};
-layout (binding = 1, std430) readonly buffer BiasProbs {float bias[];};
-layout (binding = 2, std430) writeonly buffer Weights {float weights[];};
-layout (binding = 3, std430) writeonly buffer Ids {uint ids[];};
+layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
+layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};

 const float INFINITY = 1.0 / 0.0;

@@ -95,40 +87,20 @@ void main() {
     }

     const uint logits_offset = n_experts * row;
-    const uint bias_offset = 0; // 1D
     const uint weights_offset = n_expert_used * row;
     const uint ids_offset = n_experts * row;
     const uint lane = gl_SubgroupInvocationID;

-    float probs[experts_per_thread];
+    float wt[experts_per_thread];

     [[unroll]]
     for (uint i = 0; i < n_experts; i += WARP_SIZE) {
         const uint expert = i + lane;
-        probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
+        wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
     }

-    if (gating_func == GATING_FUNC_SOFTMAX) {
-        softmax_warp_inplace(probs, n_experts, lane, nexperts_use_push);
-    } else if (gating_func == GATING_FUNC_SIGMOID) {
-        [[unroll]]
-        for (int i = 0; i < experts_per_thread; i++) {
-            probs[i] = 1.f / (1.f + exp(-probs[i]));
-        }
-    }
-
-    float selection_probs[experts_per_thread];
-    if (has_bias != 0) {
-        [[unroll]]
-        for (uint i = 0; i < n_experts; i += WARP_SIZE) {
-            const uint expert = i + lane;
-            selection_probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? probs[i / WARP_SIZE] + bias[bias_offset + expert] : -INFINITY;
-        }
-    } else {
-        [[unroll]]
-        for (int i = 0; i < experts_per_thread; i++) {
-            selection_probs[i] = probs[i];
-        }
+    if (!late_softmax) {
+        softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
     }

     // at this point, each thread holds a portion of softmax,

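On the deleted (master) side, gating is split into two arrays: probs holds the routing weights after the chosen activation, while selection_probs holds the scores actually used for ranking, optionally shifted by a per-expert bias. A scalar Python sketch of that stage, with the warp-parallel arrays flattened into plain lists (constants mirror the removed GATING_FUNC_* defines; this is an illustration, not the shader itself):

import math

GATING_FUNC_SOFTMAX = 0
GATING_FUNC_SIGMOID = 1
GATING_FUNC_SOFTMAX_WEIGHT = 2  # softmax applied late, over the selected top-k

def gate(logits, gating_func, bias=None):
    if gating_func == GATING_FUNC_SOFTMAX:
        m = max(logits)
        e = [math.exp(x - m) for x in logits]
        s = sum(e)
        probs = [x / s for x in e]
    elif gating_func == GATING_FUNC_SIGMOID:
        probs = [1.0 / (1.0 + math.exp(-x)) for x in logits]
    else:
        probs = list(logits)  # raw logits flow through; softmax happens after top-k
    # Experts are *ranked* by probs + bias, but the *weight* kept for the
    # output is the unbiased prob; hence the separate selection array.
    selection = [p + b for p, b in zip(probs, bias)] if bias is not None else probs
    return probs, selection
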
@@ -145,16 +117,14 @@ void main() {
     }

     for (int k = 0; k < n_expert_used; k++) {
-        float max_val = probs[0];
-        float max_val_s = selection_probs[0];
+        float max_val = wt[0];
         uint max_expert = lane;

         [[unroll]]
         for (int i = 1; i < experts_per_thread; i++) {
             const uint expert = lane + i * WARP_SIZE;
-            if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_probs[i] > max_val_s) {
-                max_val = probs[i];
-                max_val_s = selection_probs[i];
+            if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
+                max_val = wt[i];
                 max_expert = expert;
             }
         }

@@ -162,11 +132,9 @@ void main() {
         [[unroll]]
         for (uint mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
             const float val = subgroupShuffleXor(max_val, mask);
-            const float val_s = subgroupShuffleXor(max_val_s, mask);
             const uint expert = subgroupShuffleXor(max_expert, mask);
-            if (val_s > max_val_s || (val_s == max_val_s && expert < max_expert)) {
+            if (val > max_val || (val == max_val && expert < max_expert)) {
                 max_val = val;
-                max_val_s = val_s;
                 max_expert = expert;
             }
         }

@@ -176,14 +144,16 @@ void main() {
         }

         if ((max_expert & (WARP_SIZE - 1)) == lane) {
-            selection_probs[max_expert / WARP_SIZE] = -INFINITY;
+            wt[max_expert / WARP_SIZE] = -INFINITY;

             ids[ids_offset + k] = max_expert;
+            if (with_norm) {
                 wt_sum += max_val;
             }
         }
+    }

-    if (with_norm != 0) {
+    if (with_norm) {
         wt_sum = subgroupAdd(wt_sum);
         wt_sum = clamp(wt_sum, clamp_min, clamp_max);
         const float inv_sum = 1.0f / wt_sum;

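The loop above completes the pattern from the gating stage: experts are ranked by the (possibly biased) selection score, the chosen slot is masked to -INFINITY so the next iteration picks the runner-up, and the unbiased probability is what gets accumulated and normalized. A scalar Python sketch, with the subgroup shuffle reduction replaced by a plain argmax and the clamp on the weight sum omitted:

def top_k_experts(probs, selection, n_expert_used, with_norm):
    ids, weights, wt_sum = [], [], 0.0
    sel = list(selection)
    for _ in range(n_expert_used):
        # Ties break toward the lower expert index, as in the shader.
        best = max(range(len(sel)), key=lambda i: (sel[i], -i))
        sel[best] = float("-inf")   # mirrors selection_probs[...] = -INFINITY
        ids.append(best)
        weights.append(probs[best])
        if with_norm:
            wt_sum += probs[best]
    if with_norm:
        weights = [w / wt_sum for w in weights]
    return ids, weights
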
@@ -194,7 +164,7 @@ void main() {
         }
     }

-    if (gating_func == GATING_FUNC_SOFTMAX_WEIGHT) {
+    if (late_softmax) {
         softmax_warp_inplace(output_weights, n_expert_used, lane, true);
     }

@@ -202,7 +172,7 @@ void main() {
     for (uint i = 0; i < experts_per_thread; ++i) {
         uint idx = i * WARP_SIZE + lane;
         if (idx < n_expert_used) {
-            weights[weights_offset + idx] = output_scale * output_weights[i] + output_bias;
+            weights[weights_offset + idx] = output_weights[i];
         }
     }
 }

@@ -396,12 +396,6 @@ struct block_iq1_s {
     uint16_t qh[QUANT_K_IQ1_S/32];
 };

-struct block_iq1_s_packed16 {
-    float16_t d;
-    uint16_t qs[QUANT_K_IQ1_S/8/2];
-    uint16_t qh[QUANT_K_IQ1_S/32];
-};
-
 #define QUANT_K_IQ1_M 256
 #define QUANT_R_IQ1_M 1

@@ -411,18 +405,6 @@ struct block_iq1_m {
     uint16_t scales[QUANT_K_IQ1_M/64];
 };

-struct block_iq1_m_packed16 {
-    uint16_t qs[QUANT_K_IQ1_M/8/2];
-    uint16_t qh[QUANT_K_IQ1_M/16/2];
-    uint16_t scales[QUANT_K_IQ1_M/64];
-};
-
-struct block_iq1_m_packed32 {
-    uint32_t qs[QUANT_K_IQ1_M/8/4];
-    uint32_t qh[QUANT_K_IQ1_M/16/4];
-    uint32_t scales[QUANT_K_IQ1_M/64/2];
-};
-
 struct block_iq1_m_packed64 {
     uint64_t qs[QUANT_K_IQ1_M/8/8];
     uint64_t qh[QUANT_K_IQ1_M/16/8];

@@ -433,15 +415,12 @@ struct block_iq1_m_packed64 {
 #define QUANT_K QUANT_K_IQ1_S
 #define QUANT_R QUANT_R_IQ1_S
 #define A_TYPE block_iq1_s
-#define A_TYPE_PACKED16 block_iq1_s_packed16
 #endif

 #if defined(DATA_A_IQ1_M)
 #define QUANT_K QUANT_K_IQ1_M
 #define QUANT_R QUANT_R_IQ1_M
 #define A_TYPE block_iq1_m
-#define A_TYPE_PACKED16 block_iq1_m_packed16
-#define A_TYPE_PACKED32 block_iq1_m_packed32
 #endif

 #if defined(DATA_A_IQ1_S) || defined(DATA_A_IQ1_M)

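The packed16/packed32 structs deleted here are alternate views of the same block bytes, sized so that 16- and 32-bit loads in the shader stay within one block. A quick sanity check of the arithmetic in Python, assuming QUANT_K_IQ1_S is 256 like its IQ1_M counterpart and field widths as declared above:

QUANT_K = 256

# fp16 d + byte qs + uint16 qh
iq1_s_bytes = 2 + (QUANT_K // 8) + 2 * (QUANT_K // 32)                 # 50
# byte qs + byte qh + uint16 scales
iq1_m_bytes = (QUANT_K // 8) + (QUANT_K // 16) + 2 * (QUANT_K // 64)   # 56

# packed16 view of iq1_m: uint16 lanes, same total byte count
packed16_bytes = 2 * (QUANT_K // 8 // 2) + 2 * (QUANT_K // 16 // 2) + 2 * (QUANT_K // 64)
# packed32 view: uint32 lanes
packed32_bytes = 4 * (QUANT_K // 8 // 4) + 4 * (QUANT_K // 16 // 4) + 4 * (QUANT_K // 64 // 2)

assert iq1_m_bytes == packed16_bytes == packed32_bytes == 56
print(f"iq1_s: {iq1_s_bytes} bytes/block ({8 * iq1_s_bytes / QUANT_K} bpw), "
      f"iq1_m: {iq1_m_bytes} bytes/block ({8 * iq1_m_bytes / QUANT_K} bpw)")

Every view spans the same 56 bytes per iq1_m block (1.75 bits per weight), which is what lets the shader pick the widest load that fits its access pattern.
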
@@ -580,270 +559,7 @@ const uint[1024] iq1s_grid_const = {
 0x55dd55df, 0x55d555d7, 0x5503550c, 0x557f5501, 0x5577557d, 0x55405575, 0x555d555f, 0x55555557
 };

-// Same content as iq1s_grid_const except each 2-bit value is expanded to 4-bit
-// and has 1 added to it (allows packed values to be extracted with & 0x0F0F0F0F
-// and 0xF0F0F0F0).
-const uint32_t[2048] iq1s_grid_gpu_const = {
-0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
-0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
-0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
-0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
-0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
-0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
-0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
-0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
-0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
-0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
-0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
-0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
-0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
-0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
-0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
-0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
-0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
-0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
-0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
-0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
-0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
-0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
-0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
-0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
-0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
-0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
-0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
-0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
-0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
-0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
-0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
-0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
-0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
-0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
-0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
-0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
-0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
-0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
-0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
-0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
-0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
-0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
-0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
-0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
-0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
-0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
-0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
-0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
-0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
-0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
-0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
-0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
-0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
-0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
-0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
-0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
-0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
-0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
-0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
-0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
-0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
-0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
-0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
-0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
-0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
-0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
-0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
-0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
-0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
-0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
-0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
-0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
-0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
-0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
-0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
-0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
-0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
-0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
-0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
-0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
-0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
-0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
-0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
-0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
-0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
-0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
-0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
-0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
-0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
-0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
-0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
-0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
-0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
-0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
-0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
-0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
-0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
-0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
-0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
-0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
-0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
-0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
-0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
-0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
-0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
-0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
-0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
-0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
-0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
-0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
-0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
-0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
-0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
-0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
-0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
-0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
-0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
-0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
-0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
-0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
-0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
-0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
-0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
-0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
-0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
-0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
-0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
-0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
-0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
-0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
-0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
-0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
-0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
-0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
-0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
-0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
-0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
-0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
-0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
-0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
-0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
-0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
-0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
-0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
-0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
-0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
-0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
-0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
-0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
-0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
-0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
-0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
-0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
-0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
-0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
-0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
-0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
-0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
-0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
-0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
-0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
-0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
-0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
-0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
-0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
-0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
-0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
-0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
-0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
-0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
-0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
-0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
-0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
-0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
-0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
-0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
-0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
-0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
-0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
-0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
-0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
-0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
-0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
-0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
-0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
-0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
-0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
-0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
-0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
-0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
-0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
-0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
-0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
-0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
-0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
-0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
-0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
-0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
-0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
-0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
-0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
-0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
-0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
-0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
-0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
-0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
-0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
-0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
-0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
-0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
-0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
-0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
-0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
-0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
-0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
-0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
-0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
-0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
-0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
-0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
-0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
-0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
-0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
-0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
-0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
-0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
-0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
-0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
-0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
-0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
-0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
-0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
-0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
-0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
-0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
-0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
-0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
-0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
-0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
-0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
-0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
-0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
-0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
-0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
-0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
-0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
-0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
-0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
-0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
-0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
-0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
-0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
-0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
-0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
-0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
-0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
-};
-
 shared uint16_t iq1s_grid[2048];
-shared uint32_t iq1s_grid_gpu[2048];

 #define NEEDS_INIT_IQ_SHMEM
 void init_iq_shmem(uvec3 wgsize)

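Per the deleted comment, iq1s_grid_gpu_const is derived mechanically from iq1s_grid_const: every 2-bit field of a 16-bit grid entry becomes a 4-bit field holding value + 1, so the eight weights of one entry can be pulled out of a uint32 with a single & 0x0F0F0F0F / & 0xF0F0F0F0 mask pair. A sketch of that expansion in Python, assuming the 2-bit fields are sign-extended ternary weights in {-1, 0, +1}:

def expand_entry(g16: int) -> int:
    """Expand a 16-bit grid entry (eight 2-bit fields) into a 32-bit
    entry (eight 4-bit fields), adding 1 to each field."""
    g32 = 0
    for i in range(8):
        f = (g16 >> (2 * i)) & 0x3
        v = f - 4 if f >= 2 else f          # sign-extend the 2-bit field
        g32 |= ((v + 1) & 0xF) << (4 * i)
    return g32

assert expand_entry(0xFFFF) == 0x00000000   # all -1 -> all-zero nibbles
assert expand_entry(0x0000) == 0x11111111   # all  0 -> nibble 1 everywhere
assert expand_entry(0x5555) == 0x22222222   # all +1 -> nibble 2 everywhere
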
@@ -857,12 +573,6 @@ void init_iq_shmem(uvec3 wgsize)
             iq1s_grid[2*idx+1] = g.y;
         }
     }
-    [[unroll]] for (uint i = 0; i < iq1s_grid_gpu_const.length(); i += wgsize.x) {
-        uint idx = i + gl_LocalInvocationIndex.x;
-        if (iq1s_grid_gpu_const.length() % wgsize.x == 0 || idx < iq1s_grid_gpu_const.length()) {
-            iq1s_grid_gpu[idx] = iq1s_grid_gpu_const[idx];
-        }
-    }
     barrier();
 }
 #endif

@@ -685,7 +685,7 @@ void process_shaders() {

         // mul mat vec with integer dot product
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
-        if (is_legacy_quant(tname) || tname == "mxfp4" || is_k_quant(tname) || tname == "iq1_s" || tname == "iq1_m") {
+        if (is_legacy_quant(tname) || tname == "mxfp4" || is_k_quant(tname)) {
             string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
             string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
             string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));

@@ -944,8 +944,6 @@ void process_shaders() {
     string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
     string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    string_to_spv("cumsum_multipass2_f32", "cumsum_multipass2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

     string_to_spv("count_experts", "count_experts.comp", merge_maps(base_dict, {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}}));

@@ -1125,7 +1123,7 @@ void write_output_files() {

     for (const std::string& btype : btypes) {
         for (const auto& tname : type_names) {
-            if (btype == "q8_1" && !is_legacy_quant(tname) && tname != "mxfp4" && !is_k_quant(tname) && tname != "iq1_s" && tname != "iq1_m") {
+            if (btype == "q8_1" && !is_legacy_quant(tname) && tname != "mxfp4" && !is_k_quant(tname)) {
                 continue;
             }
             hdr << "extern const void * arr_dmmv_" << tname << "_" << btype << "_f32_data[3];\n";

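This is the consumer side of the change made in process_shaders() above: the generator decides which A-types get a q8_1 integer-dot variant, and write_output_files() must apply the identical predicate or it declares arrays for shaders that were never built. A Python transliteration of the shared condition (the type-name sets here are illustrative, not the generator's actual tables):

LEGACY = {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0"}
K_QUANTS = {"q2_k", "q3_k", "q4_k", "q5_k", "q6_k"}

def has_q8_1_dot_variant(tname: str) -> bool:
    """b7592 side of the predicate; master additionally allowed iq1_s/iq1_m."""
    return tname in LEGACY or tname == "mxfp4" or tname in K_QUANTS

# The generator emits variants and the header declares them; both call
# sites must agree, or the generated header dangles.
for tname in sorted(LEGACY | K_QUANTS | {"mxfp4", "iq1_s", "iq1_m"}):
    if has_q8_1_dot_variant(tname):
        print(f"mul_mat_vec_{tname}_q8_1_f32")
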
@@ -294,9 +294,7 @@ class Keys:
         USE_GELU = "clip.use_gelu"
         USE_SILU = "clip.use_silu"
         N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
-        WA_LAYER_INDEXES = "clip.vision.wa_layer_indexes" # used by youtuvl
         IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
-        WINDOW_SIZE = "clip.vision.window_size"

         class Attention:
             HEAD_COUNT = "clip.vision.attention.head_count"

@@ -454,7 +452,6 @@ class MODEL_ARCH(IntEnum):
     MISTRAL3 = auto()
     MIMO2 = auto()
     LLAMA_EMBED = auto()
-    MAINCODER = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):

@@ -853,7 +850,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MISTRAL3: "mistral3",
     MODEL_ARCH.MIMO2: "mimo2",
     MODEL_ARCH.LLAMA_EMBED: "llama-embed",
-    MODEL_ARCH.MAINCODER: "maincoder",
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -3261,22 +3257,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
-    MODEL_ARCH.MAINCODER: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
     # TODO
 }

@@ -3512,9 +3492,7 @@ class VisionProjectorType:
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
     LFM2A = "lfm2a" # audio
-    MUSIC_FLAMINGO = "musicflamingo" # audio
     GLM4V = "glm4v"
-    YOUTUVL = "youtuvl"


 # Items here are (block size, type size)

@@ -1129,40 +1129,11 @@ class GGUFWriter:
         self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)

     def add_vision_n_wa_pattern(self, value: int) -> None:
-        """Add window attention pattern interval for vision models.
-
-        This defines the pattern interval for window attention vs full attention layers.
-        For example, if n_wa_pattern=4, then layers 3, 7, 11, ... use full attention,
-        while other layers use window attention.
-
-        Used by models like Qwen2.5-VL where full attention layers follow a regular pattern.
-        """
         self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)

-    def add_vision_wa_layer_indexes(self, layers: Sequence[int]) -> None:
-        """Add explicit layer indexes that use full attention in vision models.
-
-        This specifies the exact layer indices (0-based) that should use full attention
-        instead of window attention. All other layers will use window attention.
-
-        Args:
-            layers: List of layer indices that use full attention (e.g., [3, 7, 11, 15])
-
-        Used by models like YoutuVL where full attention layers are explicitly specified
-        rather than following a regular pattern.
-
-        Difference from add_vision_n_wa_pattern:
-        - n_wa_pattern: Defines a regular interval pattern (every Nth layer uses full attention)
-        - wa_layer_indexes: Explicitly lists which layers use full attention (irregular pattern)
-        """
-        self.add_array(Keys.ClipVision.WA_LAYER_INDEXES, layers)
-
     def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
         self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)

-    def add_vision_window_size(self, value: int) -> None:
-        self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
-
     # audio models

     def add_audio_projection_dim(self, value: int) -> None:

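The two writer entry points removed here encode the same information at different generality, per their docstrings: a single interval for the regular Qwen2.5-VL layout versus an explicit index list for irregular layouts like YoutuVL. A short sketch of how a conversion script might pick between them (the helper and layer counts are hypothetical; only the two writer methods come from the master-side API above):

def write_full_attention_layout(writer, full_attn_layers: list[int], n_layers: int) -> None:
    # If every full-attention layer falls on a regular interval, the compact
    # n_wa_pattern form suffices: pattern=4 means layers 3, 7, 11, ... are full.
    for pattern in range(2, n_layers + 1):
        if full_attn_layers == [i for i in range(n_layers) if (i + 1) % pattern == 0]:
            writer.add_vision_n_wa_pattern(pattern)
            return
    # Otherwise fall back to the explicit (master-only) index list.
    writer.add_vision_wa_layer_indexes(full_attn_layers)

# e.g. with n_layers=16: [3, 7, 11, 15] matches pattern 4,
# while [2, 7, 13] would need the explicit list.
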
@@ -1221,7 +1221,6 @@ class TensorNameMap:
         MODEL_TENSOR.V_MMPROJ: (
             "multi_modal_projector.linear_{bid}",
             "visual.merger.mlp.{bid}", # qwen2vl
-            "merger.mlp.{bid}",
         ),

         MODEL_TENSOR.V_MMPROJ_FC: (

@@ -1259,7 +1258,6 @@ class TensorNameMap:
             "visual.patch_embed.proj", # qwen2vl
             "vision_tower.patch_embed.proj", # kimi-vl
             "model.vision.patch_embedding.proj", # cogvlm
-            "siglip2.vision_model.embeddings.patch_embedding",
         ),

         MODEL_TENSOR.V_ENC_EMBD_NORM: (

@@ -1293,7 +1291,6 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (

@@ -1311,7 +1308,6 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
             "visual.blocks.{bid}.attn.k", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
         ),

         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (

@@ -1329,7 +1325,6 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
             "visual.blocks.{bid}.attn.v", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
         ),

         MODEL_TENSOR.V_ENC_INPUT_NORM: (

@@ -1344,7 +1339,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm1", # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
         ),

         MODEL_TENSOR.V_ENC_ATTN_O: (

@@ -1360,7 +1354,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.proj", # qwen2vl
             "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
             "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
         ),

         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (

@@ -1375,7 +1368,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm2", # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
         ),

         MODEL_TENSOR.V_ENC_FFN_UP: (

@@ -1391,7 +1383,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
         ),

         MODEL_TENSOR.V_ENC_FFN_GATE: (

@ -1413,7 +1404,6 @@ class TensorNameMap:
|
||||||
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
|
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
|
||||||
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
|
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
|
||||||
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
|
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
|
||||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
|
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_LAYER_SCALE_1: (
|
MODEL_TENSOR.V_LAYER_SCALE_1: (
|
||||||
|
|
@ -1440,7 +1430,6 @@ class TensorNameMap:
|
||||||
"visual.merger.ln_q", # qwen2vl
|
"visual.merger.ln_q", # qwen2vl
|
||||||
"vision_tower.encoder.final_layernorm", # kimi-vl
|
"vision_tower.encoder.final_layernorm", # kimi-vl
|
||||||
"visual.post_layernorm", # glm4v
|
"visual.post_layernorm", # glm4v
|
||||||
"siglip2.vision_model.post_layernorm",
|
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_MM_POST_NORM: (
|
MODEL_TENSOR.V_MM_POST_NORM: (
|
||||||
|
|
@ -1457,7 +1446,6 @@ class TensorNameMap:
|
||||||
"multi_modal_projector.pre_norm",
|
"multi_modal_projector.pre_norm",
|
||||||
"pre_mm_projector_norm",
|
"pre_mm_projector_norm",
|
||||||
"model.vision.linear_proj.norm1", # cogvlm
|
"model.vision.linear_proj.norm1", # cogvlm
|
||||||
"merger.ln_q",
|
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
|
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
|
||||||
|
|
|
||||||
|
|
@@ -1 +1 @@
-ebc3a0f4a56be1c9424a89fbec09962ac34fde85
+130bc125a88bb57664b88932c48c38a1cb316fac
@@ -87,7 +87,6 @@ add_library(llama
             models/llada.cpp
             models/llama-iswa.cpp
             models/llama.cpp
-            models/maincoder.cpp
             models/mamba.cpp
             models/mimo2-iswa.cpp
             models/minicpm3.cpp
@@ -118,7 +118,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MISTRAL3, "mistral3" },
     { LLM_ARCH_MIMO2, "mimo2" },
     { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
-    { LLM_ARCH_MAINCODER, "maincoder" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -2235,23 +2234,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
             return {
                 LLM_TENSOR_TOKEN_EMBD,
             };
-        case LLM_ARCH_MAINCODER:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_Q,
-                LLM_TENSOR_ATTN_Q_NORM,
-                LLM_TENSOR_ATTN_K,
-                LLM_TENSOR_ATTN_K_NORM,
-                LLM_TENSOR_ATTN_V,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_GATE,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-            };
         default:
             GGML_ABORT("unknown architecture for tensor mapping");
     }
@@ -122,7 +122,6 @@ enum llm_arch {
     LLM_ARCH_MISTRAL3,
     LLM_ARCH_MIMO2,
     LLM_ARCH_LLAMA_EMBED,
-    LLM_ARCH_MAINCODER,
     LLM_ARCH_UNKNOWN,
 };

@@ -74,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
     { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
     { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
-    { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -217,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GROK_2;
     } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
         return LLM_CHAT_TEMPLATE_PANGU_EMBED;
-    } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
-        return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -848,14 +845,6 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "[unused9]助手:";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
-        }
-        if (add_ass) {
-            ss << "<|begin|>assistant";
-        }
     } else {
         // template not supported
         return -1;
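For reference, the removed solar-open branch renders the conversation as one flat marker stream. A minimal sketch of its output for a two-message chat with add_ass set (role names and message text are illustrative, not from the source):

<|begin|>system<|content|>You are a helpful assistant.<|end|><|begin|>user<|content|>Hello!<|end|><|begin|>assistant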
@@ -54,7 +54,6 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_GROK_2,
     LLM_CHAT_TEMPLATE_PANGU_EMBED,
-    LLM_CHAT_TEMPLATE_SOLAR_OPEN,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };

@@ -240,10 +240,9 @@ struct llama_file::impl {
                 throw std::runtime_error("unexpectedly reached end of file");
             }
         } else {
-            size_t bytes_read = 0;
-            while (bytes_read < len) {
-                const size_t to_read = len - bytes_read;
-                ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
+            bool successful = false;
+            while (!successful) {
+                off_t ret = read(fd, ptr, len);

                 if (ret == -1) {
                     if (errno == EINTR) {
@@ -252,16 +251,10 @@ struct llama_file::impl {
                     throw std::runtime_error(format("read error: %s", strerror(errno)));
                 }
                 if (ret == 0) {
-                    // EOF: allow if this read was only pulling alignment padding past file end
-                    off_t pos = lseek(fd, 0, SEEK_CUR);
-                    if (pos != -1 && (size_t) pos == size) {
-                        std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
-                        return;
-                    }
                     throw std::runtime_error("unexpectedly reached end of file");
                 }

-                bytes_read += (size_t) ret;
+                successful = true;
             }
         }
     }
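The two sides of these hunks differ in how they treat short reads: the master version accumulates bytes_read so an EINTR-interrupted or partial read() is retried for the remaining bytes (and an EOF landing exactly at the file size is zero-filled as alignment padding), while the b7592 version issues one full-length read and only retries on EINTR. A minimal standalone sketch of the accumulating pattern; the function name and error messages are illustrative:

#include <cerrno>
#include <cstddef>
#include <stdexcept>
#include <unistd.h>

// read exactly `len` bytes from `fd` into `ptr`, retrying on EINTR and on short reads
static void read_exact(int fd, char * ptr, size_t len) {
    size_t bytes_read = 0;
    while (bytes_read < len) {
        ssize_t ret = ::read(fd, ptr + bytes_read, len - bytes_read);
        if (ret == -1) {
            if (errno == EINTR) {
                continue; // interrupted by a signal - retry the same range
            }
            throw std::runtime_error("read error");
        }
        if (ret == 0) {
            throw std::runtime_error("unexpectedly reached end of file");
        }
        bytes_read += (size_t) ret; // advance past the bytes we actually got
    }
}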
@@ -126,7 +126,6 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
         case LLM_TYPE_80B_A3B: return "80B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
-        case LLM_TYPE_102B_A12B: return "102B.A12B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
@@ -1110,14 +1109,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_MAINCODER:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_1B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_QWEN3VL:
             {
                 ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1691,7 +1682,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
                 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1787,7 +1778,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 switch (hparams.n_layer) {
                     case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
-                    case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
                     case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
                     default: type = LLM_TYPE_UNKNOWN;
                 }
@@ -3330,14 +3320,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

                     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-                    const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
-                    ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
-                    const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
-
-                    GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
-                    layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
-                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);

                     layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
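The master side of the hunk above sizes ffn_up from the tensor metadata itself: it reads the stored weight's second dimension and accepts either n_ff (plain up projection) or 2*n_ff (fused gate+up). A minimal hedged sketch of that shape check, assuming the usual ggml_tensor ne[] layout; the helper name is illustrative:

#include "ggml.h"

// decide whether a stored FFN up-projection also packs the gate half,
// based only on its second dimension (mirrors the removed check above)
static bool ffn_up_contains_gate(const ggml_tensor * t_ffn_up, int64_t n_ff) {
    const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
    GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
    return n_ffn_up == n_ff * 2;
}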
@@ -4793,11 +4776,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -4860,11 +4839,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
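The pattern removed in the two hunks above is the usual tied-embeddings fallback: output.weight is loaded as optional, and when it is absent the token embedding matrix is reused as the LM head. A condensed sketch of the idiom, lifted directly from the removed lines:

// optional LM head: try output.weight first
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
    // tied embeddings: reuse token_embd as the output projection
    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}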
@@ -5231,9 +5206,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
                     layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
                     layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);

                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

@@ -6786,37 +6761,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                     }
                 } break;
-            case LLM_ARCH_MAINCODER:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    }
-                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -7462,10 +7406,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_llama<true>>(*this, params);
             } break;
-        case LLM_ARCH_MAINCODER:
-            {
-                llm = std::make_unique<llm_build_maincoder>(*this, params);
-            } break;
         case LLM_ARCH_DECI:
             {
                 llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7500,7 +7440,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_MODERN_BERT:
             {
-                llm = std::make_unique<llm_build_modern_bert>(*this, params);
+                llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
             } break;
         case LLM_ARCH_NEO_BERT:
             {
@@ -8074,7 +8014,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ERNIE4_5_MOE:
         case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_LLAMA_EMBED:
-        case LLM_ARCH_MAINCODER:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -119,7 +119,6 @@ enum llm_type {
     LLM_TYPE_31B_A3_5B,
     LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
-    LLM_TYPE_102B_A12B, // Solar-Open
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
     LLM_TYPE_235B_A22B,
@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
-            case LLAMA_VOCAB_PRE_TYPE_YOUTU:
-                regex_exprs = {
-                    "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥-ゟ゠-ヿ]+",
-                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                 regex_exprs = {
                     "[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
             case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
-            case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
                 regex_exprs = {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1867,11 +1860,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "deepseek-v3") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
                 clean_spaces = false;
-            } else if (
-                tokenizer_pre == "youtu") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
-                clean_spaces = false;
-                ignore_merges = true;
             } else if (
                 tokenizer_pre == "falcon") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -2027,10 +2015,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "minimax-m2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
                 clean_spaces = false;
-            } else if (
-                tokenizer_pre == "solar-open") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
-                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -2203,8 +2187,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         // for now, we apply this workaround to find the tokens based on their text

         for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
             // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
             if (special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false
@@ -2220,10 +2202,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<end_of_utterance>" // smoldocling
                 ) {
                     special_eot_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
             }
@@ -2234,10 +2216,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eom_id|>"
                 ) {
                     special_eom_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
             }
@@ -2254,10 +2236,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|code_prefix|>" // GLM-4.5
                 ) {
                     special_fim_pre_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
             }
@@ -2274,10 +2256,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|code_suffix|>" // GLM-4.5
                 ) {
                     special_fim_suf_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
             }
@@ -2294,10 +2276,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|code_middle|>" // GLM-4.5
                 ) {
                     special_fim_mid_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
             }
@@ -2311,10 +2293,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<PAD>"
                 ) {
                     special_fim_pad_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
             }
@@ -2329,10 +2311,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<reponame>" // Granite
                 ) {
                     special_fim_rep_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
             }
@@ -2343,41 +2325,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|file_sep|>" // Qwen
                 ) {
                     special_fim_sep_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
            }
        }

-        // auto-detect unused tokens: e.g. control tokens with the word "unused"
-        // ideally, these tokens should be marked as unused during conversion
-        {
-            uint32_t n_unused = 0;
-
-            for (const auto & t : token_to_id) {
-                auto & attr = id_to_token[t.second].attr;
-
-                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    continue;
-                }
-
-                if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
-                    if (strstr(t.first.c_str(), "unused") != NULL) {
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
-                    }
-                }
-
-                if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
-                    n_unused++;
-                }
-            }
-
-            LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
-        }
-
         // maintain a list of tokens that cause end-of-generation
         // this is currently determined based on the token text, which is obviously not ideal
         // ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2396,16 +2352,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }

         for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
             if (false
                 || t.first == "<|eot_id|>"
                 || t.first == "<|im_end|>"
                 || t.first == "<|end|>"
                 || t.first == "<|return|>" // o200k_harmony
                 || t.first == "<|call|>" // o200k_harmony
-                || t.first == "<|flush|>" // solar-open
-                || t.first == "<|calls|>" // solar-open
                 || t.first == "<end_of_turn>"
                 || t.first == "<|endoftext|>"
                 || t.first == "<|eom_id|>"
@@ -2415,28 +2367,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "<end_of_utterance>" // smoldocling
             ) {
                 special_eog_ids.insert(t.second);
-                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                     LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                         __func__, t.second, t.first.c_str());
-                    attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                    id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                 }
             } else {
-                if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
                 // token is control, but not marked as EOG -> print a debug log
-                if (special_eog_ids.count(t.second) == 0) {
+                if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
                     LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
                         __func__, t.second, t.first.c_str());
                 }
             }
         }
-            }

         // @ngxson : quick hack for gpt-oss, always render these tokens
         for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
             if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
             }
         }

@@ -2456,42 +2404,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }

-        // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
-        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
         // we remove the "<|end|>" token from the EOG list
         {
             bool has_return = false;
             bool has_call = false;
             bool has_end = false;
-            bool has_flush = false;

             llama_token end_id = LLAMA_TOKEN_NULL;

             LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
             for (auto tid : special_eog_ids) {
-                auto & text = id_to_token[tid].text;
-
-                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
+                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());

-                if (text == "<|return|>") {
+                if (id_to_token[tid].text == "<|return|>") {
                     has_return = true;
-                } else if (text == "<|call|>" || text == "<|calls|>") {
+                } else if (id_to_token[tid].text == "<|call|>") {
                     has_call = true;
-                } else if (text == "<|flush|>") {
-                    has_flush = true;
-                } else if (text == "<|end|>") {
+                } else if (id_to_token[tid].text == "<|end|>") {
                     has_end = true;
                     end_id = tid;
                 }
             }

-            if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
+            if (has_return && has_call && has_end) {
                 special_eog_ids.erase(end_id);
-
-                auto & attr = id_to_token[end_id].attr;
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
-
-                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+                id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
             }
         }
     }
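A subtle difference runs through all of the token-attribute hunks above: master ORs new attribute bits into the existing value, while b7592 overwrites the whole field. A small self-contained illustration of why the OR form preserves previously set bits (the enum bit values here are illustrative, not taken from the source):

#include <cstdio>

enum llama_token_attr {
    LLAMA_TOKEN_ATTR_CONTROL      = 1 << 0,
    LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 1, // illustrative bit assignments
};

int main() {
    llama_token_attr attr = LLAMA_TOKEN_ATTR_USER_DEFINED;

    // master-style update: keeps USER_DEFINED while adding CONTROL
    attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
    std::printf("or'd: %d\n", attr);      // prints 3 - both bits set

    // b7592-style update: drops every previously set bit
    attr = LLAMA_TOKEN_ATTR_CONTROL;
    std::printf("assigned: %d\n", attr);  // prints 1 - CONTROL only
    return 0;
}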
@@ -51,8 +51,6 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
     LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
     LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
-    LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
-    LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
 };

 struct LLM_KV;
@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
                 LLM_FFN_GELU, LLM_FFN_SEQ, il);
             cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
-            const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
-            auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
             cur = build_ffn(cur,
-                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_up, NULL, NULL,
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-                    type_op, LLM_FFN_PAR, il);
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
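The removed up_contains_gate logic infers the activation from tensor shape: if the checkpoint ships no separate gate tensor and ffn_up's second dimension differs from n_ff, the up projection must pack both halves of a GEGLU. A hedged standalone sketch of that decision; the stand-in types below replace the real llama.cpp ones for illustration only:

// stand-ins for the real llama.cpp types, for illustration only
struct tensor { long long ne[4]; };

enum ffn_op { FFN_GELU, FFN_GEGLU };

// no separate gate tensor + an up projection wider than n_ff means the gate
// and up halves are fused in one matrix -> use GEGLU instead of plain GELU
static ffn_op pick_ffn_op(const tensor * ffn_gate, const tensor * ffn_up, long long n_ff) {
    const bool up_contains_gate = ffn_gate == nullptr && ffn_up->ne[1] != n_ff;
    return up_contains_gate ? FFN_GEGLU : FFN_GELU;
}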
@@ -3,14 +3,12 @@
 llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
-    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+    float kq_scale = 1.0f / sqrtf(float(n_embd_head));

     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     GGML_ASSERT(n_embd_head == hparams.n_rot);

-    ggml_tensor * inpL;
-    ggml_tensor * cur;
+    ggml_tensor *inpL, *cur;

     inpL = build_inp_embd(model.tok_embd);

     ggml_tensor * inp_pos = build_inp_pos();
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                     model.layers[il].ffn_exp_probs_b,
                     n_expert, n_expert_used,
                     LLM_FFN_SILU, hparams.expert_weights_norm,
-                    hparams.expert_weights_scale, hparams.expert_weights_scale,
+                    true, hparams.expert_weights_scale,
                     (llama_expert_gating_func_type) hparams.expert_gating_func,
                     il);
             cb(moe_out, "ffn_moe_out", il);
@@ -1,5 +1,7 @@
 #include "models.h"

+
+
 llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
     inpL = build_inp_embd(model.tok_embd);

     // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
     cb(inpL, "inp_scaled", -1);
+    }

     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
     inpL = build_inp_embd(model.tok_embd);

     // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
     cb(inpL, "inp_scaled", -1);
+    }
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -1,5 +1,7 @@
 #include "models.h"

+
+
 llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params),
     model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
     inpL = build_inp_embd(model.tok_embd);

     // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
     cb(inpL, "inp_scaled", -1);
+    }
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -1,117 +0,0 @@
-#include "models.h"
-
-llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    // inp_pos - contains the positions
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    auto * inp_attn = build_attn_inp_kv();
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        // norm
-        cur = build_norm(inpL,
-                model.layers[il].attn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
-
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
-
-            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_normed", il);
-
-            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-            cb(Kcur, "Kcur_normed", il);
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-        }
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // feed-forward network
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up, NULL, NULL,
-                model.layers[il].ffn_gate, NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur, "ffn_out", il);
-
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
@@ -312,10 +312,6 @@ struct llm_build_llama_iswa : public llm_graph_context {
     llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_maincoder : public llm_graph_context {
-    llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_mamba : public llm_graph_context_mamba {
     llm_build_mamba(const llama_model & model, const llm_graph_params & params);
 };
@@ -336,6 +332,7 @@ struct llm_build_mistral3 : public llm_graph_context {
     llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
 };

+template <bool iswa>
 struct llm_build_modern_bert : public llm_graph_context {
     llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
 };
@@ -1,6 +1,7 @@
 #include "models.h"

-llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
     const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

@@ -23,7 +24,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
     auto * inp_attn = build_attn_inp_no_cache();

     for (int il = 0; il < n_layer; ++il) {
-        float freq_base_l = model.get_rope_freq_base(cparams, il);
+        float freq_base_l = 0.0f;
+
+        if constexpr (iswa) {
+            freq_base_l = model.get_rope_freq_base(cparams, il);
+        } else {
+            freq_base_l = freq_base;
+        }

         cur = inpL;

@@ -113,3 +120,7 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
     res->t_embd = cur;
     ggml_build_forward_expand(gf, cur);
 }
+
+// Explicit template instantiations
+template struct llm_build_modern_bert<false>;
+template struct llm_build_modern_bert<true>;
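Turning llm_build_modern_bert into a template while keeping its definition in the .cpp file relies on explicit instantiation: the translation unit emits code for both iswa variants so other files can link against them without ever seeing the body. A minimal standalone example of the same pattern (the widget type is illustrative):

// widget.h - declaration only; the definition lives in widget.cpp
template <bool iswa>
struct widget {
    int build() const;
};

// widget.cpp - definition plus explicit instantiations
template <bool iswa>
int widget<iswa>::build() const {
    if constexpr (iswa) {
        return 1; // sliding-window variant
    } else {
        return 0; // dense variant
    }
}

// without these two lines, callers in other translation units fail to link
template struct widget<false>;
template struct widget<true>;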
@@ -964,11 +964,6 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
         { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
         { "\\p{S}", unicode_cpt_flags::SYMBOL },
-        { "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
-        { "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
-        { "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
-        { "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
-        { "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
     };

     static const std::map<int, int> k_ucat_cpt = {
@@ -1079,14 +1074,11 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 continue;
             }

-            // Match \p{...} Unicode properties of varying lengths
-            if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
+            if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
                 regex_expr[i + 1] == 'p' &&
-                regex_expr[i + 2] == '{') {
-                // Find the closing brace
-                size_t closing_brace = regex_expr.find('}', i + 3);
-                if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
-                    const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
+                regex_expr[i + 2] == '{' &&
+                regex_expr[i + 4] == '}') {
+                const std::string pat = regex_expr.substr(i, 5);
                 if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
                     if (!inside) {
                         regex_expr_collapsed += '[';
@ -1096,11 +1088,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
if (!inside) {
|
if (!inside) {
|
||||||
regex_expr_collapsed += ']';
|
regex_expr_collapsed += ']';
|
||||||
}
|
}
|
||||||
i = closing_brace;
|
i += 4;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
regex_expr_collapsed += regex_expr[i];
|
regex_expr_collapsed += regex_expr[i];
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
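Note: the `-` side here is the generalized scan that accepts multi-letter properties such as `\p{Lu}`; the `+` side only matches the fixed five-character form `\p{X}`. The generalized scan is easy to exercise standalone, with a stub property table in place of `k_ucat_enum`:

```cpp
#include <cstdio>
#include <map>
#include <string>

// Standalone sketch of the varying-length \p{...} scan: find the closing
// brace within a bounded distance and look up the whole pattern, instead of
// assuming the fixed 5-character form "\p{X}".
int main() {
    static const std::map<std::string, int> k_props = {
        { "\\p{N}",  1 },
        { "\\p{Lu}", 2 }, // two-letter property; the fixed-length check rejects this
    };
    const std::string regex_expr = "x\\p{Lu}y";
    for (size_t i = 0; i < regex_expr.size(); ++i) {
        if (regex_expr[i] == '\\' && i + 3 < regex_expr.size() &&
            regex_expr[i + 1] == 'p' && regex_expr[i + 2] == '{') {
            const size_t closing_brace = regex_expr.find('}', i + 3);
            if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
                const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
                if (k_props.count(pat)) {
                    printf("matched property %s\n", pat.c_str());
                    i = closing_brace; // resume after the pattern
                }
            }
        }
    }
}
```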
@@ -1158,7 +1158,6 @@ struct test_case {
     }
 
     virtual bool run_whole_graph() { return false; }
-    virtual std::vector<ggml_tensor *> fusion_test_nodes() { return {}; }
 
     ggml_cgraph * gf = nullptr;
     ggml_cgraph * gb = nullptr;
@@ -1392,13 +1391,7 @@ struct test_case {
             GGML_UNUSED(index);
         };
 
-        std::vector<ggml_tensor *> fused_nodes_to_verify = fusion_test_nodes();
-        if (fused_nodes_to_verify.size() == 0 && run_whole_graph()) {
-            fused_nodes_to_verify.push_back(out);
-        }
-        const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud,
-                                                               run_whole_graph() ? fused_nodes_to_verify.data() : nullptr,
-                                                               fused_nodes_to_verify.size());
+        const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr);
 
         ggml_backend_buffer_free(buf);
 
@@ -5187,8 +5180,6 @@ struct test_topk_moe : public test_case {
     const bool bias_probs;
     const MoeGatingFunc gating_func;
     const float scale_w;
-    ggml_tensor * weights {};
-    ggml_tensor * selected_experts {};
 
     test_topk_moe(std::array<int64_t, 4> ne = { 10, 5, 1, 1 },
                   int n_expert_used = 1,
@@ -5226,16 +5217,16 @@ struct test_topk_moe : public test_case {
 
        ggml_tensor * selection_probs = probs;
        if (bias_probs) {
-            ggml_tensor * exp_probs_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
+            ggml_tensor * exp_probs_b = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
            ggml_set_name(exp_probs_b, "exp_probs_b");
            selection_probs = ggml_add(ctx, probs, exp_probs_b);
            ggml_set_name(selection_probs, "selection_probs");
        }
 
-        selected_experts = ggml_argsort_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+        ggml_tensor * selected_experts = ggml_argsort_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
        ggml_set_name(selected_experts, "selected_experts");
 
-        weights = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+        ggml_tensor * weights = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
        ggml_set_name(weights, "weights");
 
        if (gating_func == GATING_FUNC_SOFTMAX_WEIGHT) {
@@ -5261,21 +5252,6 @@ struct test_topk_moe : public test_case {
        ggml_set_name(weights, "weights");
        return weights;
    }
-    // Verify two outputs
-    std::vector<ggml_tensor *> fusion_test_nodes() override { return { selected_experts, weights }; }
-
-    // allow output in arbitrary order
-    double err(const float * a, const float * b, size_t n) override {
-        std::vector<float> a2(n);
-        std::vector<float> b2(n);
-        for (size_t i = 0; i < n; ++i) {
-            a2[i] = a[i];
-            b2[i] = b[i];
-        }
-        std::sort(a2.begin(), a2.end());
-        std::sort(b2.begin(), b2.end());
-        return nmse(a2.data(), b2.data(), n);
-    }
 };
 
 struct test_mul_mat_vec_fusion : public test_case {
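Note: the removed `err` override makes the backend comparison order-insensitive: both outputs are sorted before the error is computed, so two backends that emit the same top-k values in a different order still pass. The idea in isolation, with a local `nmse` stand-in for the harness helper (assumed here, not the test suite's actual implementation):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Local stand-in for the harness's nmse(): normalized mean squared error.
static double nmse(const float * a, const float * b, size_t n) {
    double mse = 0.0, ref = 0.0;
    for (size_t i = 0; i < n; ++i) {
        mse += (a[i] - b[i]) * (a[i] - b[i]);
        ref += double(a[i]) * a[i];
    }
    return ref > 0.0 ? mse / ref : mse;
}

// Order-insensitive comparison, as in the removed test_topk_moe::err:
// sort copies of both outputs, then compare.
static double err_sorted(const float * a, const float * b, size_t n) {
    std::vector<float> a2(a, a + n), b2(b, b + n);
    std::sort(a2.begin(), a2.end());
    std::sort(b2.begin(), b2.end());
    return nmse(a2.data(), b2.data(), n);
}

int main() {
    const float x[] = { 0.7f, 0.2f, 0.1f };
    const float y[] = { 0.1f, 0.7f, 0.2f }; // same values, different order
    printf("raw nmse = %g, sorted nmse = %g\n", nmse(x, y, 3), err_sorted(x, y, 3));
}
```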
@@ -724,30 +724,6 @@ static void test_tools_oaicompat_json_conversion() {
             "]"
         ),
         common_chat_tools_to_json_oaicompat<json>({special_function_tool}).dump(2));
 
-    {
-        auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse(
-            R"([{"type": "function", "function": {"name": "test_func", "description": "A test"}}])"));
-        assert_equals((size_t) 1, tools_no_params.size());
-        assert_equals(std::string("test_func"), tools_no_params[0].name);
-        assert_equals(std::string("A test"), tools_no_params[0].description);
-        assert_equals(std::string("{}"), tools_no_params[0].parameters);
-    }
-    {
-        auto tools_no_desc = common_chat_tools_parse_oaicompat(json::parse(
-            R"([{"type": "function", "function": {"name": "test_func", "parameters": {"type": "object"}}}])"));
-        assert_equals((size_t) 1, tools_no_desc.size());
-        assert_equals(std::string("test_func"), tools_no_desc[0].name);
-        assert_equals(std::string(""), tools_no_desc[0].description);
-    }
-    {
-        auto tools_minimal = common_chat_tools_parse_oaicompat(json::parse(
-            R"([{"type": "function", "function": {"name": "test_func"}}])"));
-        assert_equals((size_t) 1, tools_minimal.size());
-        assert_equals(std::string("test_func"), tools_minimal[0].name);
-        assert_equals(std::string(""), tools_minimal[0].description);
-        assert_equals(std::string("{}"), tools_minimal[0].parameters);
-    }
 }
 
 static void test_template_output_parsers() {
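Note: the deleted test blocks cover tool definitions whose optional `description` or `parameters` fields are absent, which is exactly the `at()` vs `value()` distinction in nlohmann::json: `at()` throws on a missing key, `value()` substitutes a default. A minimal sketch independent of the llama.cpp helpers:

```cpp
#include <cstdio>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

// Tolerant field access: value() falls back to a default for absent keys,
// so a tool definition may omit "description" and "parameters" entirely.
int main() {
    const json function = json::parse(R"({"name": "test_func"})");

    const std::string name        = function.at("name");               // required: throws if missing
    const std::string description = function.value("description", ""); // optional: "" if missing
    const std::string parameters  = function.value("parameters", json::object()).dump(); // optional: "{}"

    printf("%s | '%s' | %s\n", name.c_str(), description.c_str(), parameters.c_str());
}
```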
@@ -27,7 +27,6 @@ add_library(mtmd
     models/qwen3vl.cpp
     models/siglip.cpp
     models/whisper-enc.cpp
-    models/youtuvl.cpp
     )
 
 set_target_properties(mtmd PROPERTIES
@@ -49,7 +49,6 @@
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
 #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
-#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
 #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
@@ -181,7 +180,6 @@ enum projector_type {
     PROJECTOR_TYPE_GLMA,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
-    PROJECTOR_TYPE_MUSIC_FLAMINGO,
     PROJECTOR_TYPE_LFM2,
     PROJECTOR_TYPE_KIMIVL,
     PROJECTOR_TYPE_LIGHTONOCR,
@@ -189,7 +187,6 @@ enum projector_type {
     PROJECTOR_TYPE_JANUS_PRO,
     PROJECTOR_TYPE_LFM2A,
     PROJECTOR_TYPE_GLM4V,
-    PROJECTOR_TYPE_YOUTUVL,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -212,7 +209,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_GLMA, "glma"},
     { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
-    { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
     { PROJECTOR_TYPE_LFM2, "lfm2"},
     { PROJECTOR_TYPE_KIMIVL, "kimivl"},
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
@@ -220,7 +216,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
     { PROJECTOR_TYPE_LFM2A, "lfm2a"},
     { PROJECTOR_TYPE_GLM4V, "glm4v"},
-    { PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -61,7 +61,6 @@ struct clip_hparams {
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
-    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
 
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
@@ -320,8 +319,7 @@ struct clip_model {
 
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
-            || proj_type == PROJECTOR_TYPE_VOXTRAL
-            || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+            || proj_type == PROJECTOR_TYPE_VOXTRAL;
     }
 
     bool audio_has_stack_frames() const {
@@ -818,7 +818,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
             } break;
@@ -846,10 +845,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
             } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
-            } break;
         default:
             GGML_ABORT("missing cgraph builder");
     }
@@ -1163,20 +1158,6 @@ struct clip_model_loader {
                         LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                     }
                 } break;
-            case PROJECTOR_TYPE_YOUTUVL:
-                {
-                    hparams.n_merge = 2;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                    get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
-                    std::vector<int> wa_layer_indexes_vec;
-                    get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
-                    for (auto & layer : wa_layer_indexes_vec) {
-                        hparams.wa_layer_indexes.insert(layer);
-                    }
-                    // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
-                    hparams.set_limit_image_tokens(1, 62500);
-                    hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
-                } break;
             case PROJECTOR_TYPE_GLM4V:
                 {
                     hparams.rope_theta = 10000.0f;
@@ -1195,7 +1176,6 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_QWEN2A:
             case PROJECTOR_TYPE_GLMA:
             case PROJECTOR_TYPE_VOXTRAL:
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                 {
                     bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                          model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1246,13 +1226,6 @@ struct clip_model_loader {
         LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
         LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
         LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-        if (!hparams.wa_layer_indexes.empty()) {
-            LOG_INF("%s: wa_layer_indexes: ", __func__);
-            for (auto & layer : hparams.wa_layer_indexes) {
-                LOG_INF("%d ", layer);
-            }
-            LOG_INF("\n");
-        }
         if (hparams.image_min_pixels > 0) {
             LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
         }
@@ -1520,14 +1493,6 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                 } break;
-            case PROJECTOR_TYPE_YOUTUVL:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
-                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
-                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
             case PROJECTOR_TYPE_GLM4V:
                 {
                     model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -1611,17 +1576,6 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                     model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                 } break;
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-                {
-                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
-                    model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
-                    model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
-                    model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
-                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -2730,57 +2684,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 // res_imgs->data[0] = *res;
                 res_imgs->entries.push_back(std::move(img_f32));
             } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                const int patch_size = params.patch_size; // typically 16
-                const int merge_size = params.n_merge; // typically 2
-                const int align_size = patch_size * merge_size; // 32
-
-                const int max_num_patches = params.image_max_pixels > 0 ?
-                    params.image_max_pixels / (patch_size * patch_size) : 256;
-
-                // Linear search for optimal scale to fit within max_num_patches
-                float scale = 1.0f;
-                int target_height = original_size.height;
-                int target_width = original_size.width;
-
-                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
-                    float scaled_size = size * scale;
-                    // Round up to nearest multiple of align_size
-                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
-                    // Ensure at least one patch
-                    return std::max(align_size, aligned);
-                };
-
-                // Linear search with 0.02 step size
-                while (scale > 0.0f) {
-                    target_height = get_scaled_image_size(scale, original_size.height);
-                    target_width = get_scaled_image_size(scale, original_size.width);
-
-                    int num_patches_h = target_height / patch_size;
-                    int num_patches_w = target_width / patch_size;
-                    int num_patches = num_patches_h * num_patches_w;
-
-                    if (num_patches > max_num_patches) {
-                        scale -= 0.02f;
-                    } else {
-                        break;
-                    }
-                }
-
-                clip_image_size new_size = {target_width, target_height};
-
-                // Resize the image
-                clip_image_u8 resized;
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-
-                // Normalize to float32
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-
-                // Add to results
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
 
         case PROJECTOR_TYPE_IDEFICS3:
             {
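Note: the removed preprocessing block is self-contained arithmetic: round each scaled dimension up to a multiple of `patch_size * merge_size`, then lower the scale in 0.02 steps until the resulting patch count fits the budget. The same search can be rerun standalone (constants taken from the removed comments; the input size is an arbitrary example):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Standalone rerun of the removed scale search.
int main() {
    const int patch_size      = 16;
    const int merge_size      = 2;
    const int align_size      = patch_size * merge_size; // 32
    const int max_num_patches = 256;                     // default budget

    const int orig_w = 1024, orig_h = 768;               // example input

    // Round a scaled dimension up to a multiple of align_size, at least one merged patch.
    auto get_scaled_image_size = [align_size](float scale, int size) -> int {
        const int aligned = static_cast<int>(std::ceil(size * scale / align_size)) * align_size;
        return std::max(align_size, aligned);
    };

    float scale = 1.0f;
    int target_w = orig_w, target_h = orig_h;
    while (scale > 0.0f) {
        target_h = get_scaled_image_size(scale, orig_h);
        target_w = get_scaled_image_size(scale, orig_w);
        const int num_patches = (target_h / patch_size) * (target_w / patch_size);
        if (num_patches > max_num_patches) {
            scale -= 0.02f; // step the scale down until the budget fits
        } else {
            break;
        }
    }
    printf("scale=%.2f -> %dx%d\n", scale, target_w, target_h);
}
```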
@@ -3013,7 +2916,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         default:
             break;
@@ -3029,7 +2931,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
             return (img->ny / params.patch_size) / 2;
         default:
             break;
@@ -3090,7 +2991,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
             {
                 // dynamic size (2 conv, so double patch size)
                 int x_patch = img->nx / (params.patch_size * 2);
@@ -3131,7 +3031,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 n_patches = img->nx;
 
@@ -3218,6 +3117,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int pos_w = image_size_width / patch_size;
     const int pos_h = image_size_height / patch_size;
 
+    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
 
     auto get_inp_tensor = [&gf](const char * name) {
         ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3366,11 +3266,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 set_input_i32("positions", positions);
             } break;
         case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_YOUTUVL:
             {
                 // pw * ph = number of tokens output by ViT after apply patch merger
                 // ipw * ipw = number of vision token been processed inside ViT
-                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                 const int merge_ratio = 2;
                 const int pw = image_size_width / patch_size / merge_ratio;
                 const int ph = image_size_height / patch_size / merge_ratio;
@@ -3381,7 +3279,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 std::vector<int> inv_idx(ph * pw);
 
                 if (use_window_attn) {
-                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                    const int attn_window_size = 112;
                     const int grid_window = attn_window_size / patch_size / merge_ratio;
                     int dst = 0;
                     // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3505,7 +3403,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
             {
@@ -3619,7 +3516,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_JANUS_PRO:
-        case PROJECTOR_TYPE_YOUTUVL:
             return ctx->model.mm_1_b->ne[0];
         case PROJECTOR_TYPE_QWEN3VL:
             // main path + deepstack paths
@@ -3630,7 +3526,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.projection->ne[1];
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_INTERNVL:
             return ctx->model.mm_3_w->ne[1];
@@ -3692,8 +3587,7 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
         || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
-        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
-        || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -27,11 +27,6 @@ struct clip_graph_qwen3vl : clip_graph {
     ggml_cgraph * build() override;
 };
 
-struct clip_graph_youtuvl : clip_graph {
-    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-};
-
 struct clip_graph_minicpmv : clip_graph {
     clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
             FFN_GELU_ERF,
             -1);
 
-    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-        // projector
-        cur = build_ffn(cur,
-            model.mm_1_w, model.mm_1_b,
-            nullptr, nullptr,
-            model.mm_2_w, model.mm_2_b,
-            FFN_GELU_ERF,
-            -1);
-
     } else if (proj_type == PROJECTOR_TYPE_GLMA) {
         cur = ggml_norm(ctx0, cur, hparams.eps);
         cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
@@ -1,179 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_youtuvl::build() {
-    GGML_ASSERT(model.class_embedding == nullptr);
-    const int batch_size = 1;
-    const bool use_window_attn = !hparams.wa_layer_indexes.empty();
-    const int n_pos = n_patches;
-    const int num_position_ids = n_pos * 4;
-    const int m = 2;
-    const int Wp = n_patches_x;
-    const int Hp = n_patches_y;
-    const int Hm = Hp / m;
-    const int Wm = Wp / m;
-    norm_type norm_t = NORM_TYPE_NORMAL;
-
-    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
-    ggml_tensor * inp = build_inp_raw();
-
-    // change conv3d to linear
-    // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
-    {
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            Wm * m * patch_size, m * patch_size, Hm, 3);
-        inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, Wm, m * patch_size, Hm);
-
-        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, patch_size, m, Hm * Wm);
-
-        inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            patch_size, 3, patch_size, Hm * Wm * m * m);
-
-        inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
-        inp = ggml_cont_3d(
-            ctx0, inp,
-            3*patch_size* patch_size, Hm * Wm * m * m, 1);
-    }
-    inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
-
-    if (model.patch_bias) {
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
-    inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
-
-    ggml_tensor * inpL = inp;
-    ggml_tensor * window_mask = nullptr;
-    ggml_tensor * window_idx = nullptr;
-    ggml_tensor * inv_window_idx = nullptr;
-
-    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
-    ggml_set_name(positions, "positions");
-    ggml_set_input(positions);
-
-    // pre-layernorm
-    if (model.pre_ln_w) {
-        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
-    }
-    if (use_window_attn) {
-        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
-        ggml_set_name(inv_window_idx, "inv_window_idx");
-        ggml_set_input(inv_window_idx);
-        // mask for window attention
-        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
-        ggml_set_name(window_mask, "window_mask");
-        ggml_set_input(window_mask);
-
-        // if flash attn is used, we need to pad the mask and cast to f16
-        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
-            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
-        }
-
-        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
-        GGML_ASSERT(batch_size == 1);
-        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
-        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
-        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
-    }
-
-    // loop over layers
-    for (int il = 0; il < n_layer; il++) {
-        const auto & layer = model.layers[il];
-        const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
-
-        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
-
-        // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-        // self-attention
-        {
-            ggml_tensor * Qcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
-            ggml_tensor * Kcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
-            ggml_tensor * Vcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
-
-            Qcur = ggml_rope_multi(
-                ctx0, Qcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-            Kcur = ggml_rope_multi(
-                ctx0, Kcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-
-            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
-
-            cur = build_attn(layer.o_w, layer.o_b,
-                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
-        }
-        // re-add the layer input, e.g., residual
-        cur = ggml_add(ctx0, cur, inpL);
-
-        inpL = cur; // inpL = residual, cur = hidden_states
-
-        // layernorm2
-        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-
-        // ffn
-        cur = build_ffn(cur,
-            layer.ff_up_w, layer.ff_up_b,
-            nullptr, nullptr,
-            layer.ff_down_w, layer.ff_down_b,
-            hparams.ffn_op, il);
-
-        // residual 2
-        cur = ggml_add(ctx0, inpL, cur);
-
-        inpL = cur;
-    }
-
-    ggml_tensor * embeddings = inpL;
-    if (use_window_attn) {
-        const int spatial_merge_unit = 4;
-        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
-        ggml_set_name(window_idx, "window_idx");
-        ggml_set_input(window_idx);
-        GGML_ASSERT(batch_size == 1);
-        embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
-        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
-        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
-        cb(embeddings, "window_order_restored", -1);
-    }
-
-    // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
-    if (model.post_ln_w) {
-        embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
-    }
-
-    // Now apply merger (VLPatchMerger):
-    // 1. Apply RMS norm (ln_q in VLPatchMerger)
-    embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
-    cb(embeddings, "merger_normed", -1);
-
-    // 2. First reshape for spatial merge (merge 2x2 patches)
-    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-    cb(embeddings, "merger_reshaped", -1);
-
-    embeddings = build_ffn(embeddings,
-        model.mm_0_w, model.mm_0_b,
-        nullptr, nullptr,
-        model.mm_1_w, model.mm_1_b,
-        FFN_GELU,
-        -1);
-    ggml_build_forward_expand(gf, embeddings);
-
-    return gf;
-}
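Note: the deleted graph permutes merged patch rows into window order with `ggml_get_rows(ctx0, inpL, inv_window_idx)` before the encoder and restores the original order with `window_idx` afterwards; the two index tensors are a permutation and its inverse. A plain-array sketch of that round trip (the example permutation is made up):

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

// Plain-array sketch of the gather pair in the deleted graph:
// inv_window_idx groups rows into attention windows, window_idx undoes it.
int main() {
    const std::vector<int> inv_window_idx = { 2, 0, 3, 1 }; // hypothetical window ordering

    // Invert the permutation: if row j of the windowed layout came from row
    // inv_window_idx[j], then window_idx[inv_window_idx[j]] = j.
    std::vector<int> window_idx(inv_window_idx.size());
    for (size_t j = 0; j < inv_window_idx.size(); ++j) {
        window_idx[inv_window_idx[j]] = (int) j;
    }

    std::vector<float> rows(inv_window_idx.size());
    std::iota(rows.begin(), rows.end(), 0.0f);

    // Equivalent of ggml_get_rows on a 1-element "row".
    auto gather = [](const std::vector<float> & src, const std::vector<int> & idx) {
        std::vector<float> dst(src.size());
        for (size_t j = 0; j < idx.size(); ++j) dst[j] = src[idx[j]];
        return dst;
    };

    const std::vector<float> windowed = gather(rows, inv_window_idx);
    const std::vector<float> restored = gather(windowed, window_idx);
    for (float v : restored) printf("%.0f ", v); // prints 0 1 2 3: original order restored
    printf("\n");
}
```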
@@ -283,7 +283,7 @@ struct mtmd_context {
             // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
             img_end = "[IMG_END]";
 
-        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
             // <|vision_start|> ... (image embeddings) ... <|vision_end|>
             img_beg = "<|vision_start|>";
             img_end = "<|vision_end|>";
@@ -330,7 +330,6 @@ struct mtmd_context {
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_VOXTRAL:
             case PROJECTOR_TYPE_GLMA:
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                 audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                 break;
             case PROJECTOR_TYPE_LFM2A:
@@ -353,9 +352,6 @@ struct mtmd_context {
             // [BEGIN_AUDIO] ... (embeddings) ...
             aud_beg = "[BEGIN_AUDIO]";
 
-        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-            // <sound> ... (embeddings) ...
-            aud_beg = "<sound>";
         }
     }
@@ -12,7 +12,6 @@
 #include <cmath>
 #include <cctype>
 #include <algorithm>
-#include <filesystem>
 
 struct quant_option {
     std::string name;
@@ -644,11 +643,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
-        fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
-        return 1;
-    }
-
     print_build_info();
 
     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
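Note: the removed same-file guard is reusable as-is anywhere an input must not be overwritten in place: `std::filesystem::equivalent` compares the underlying files rather than path strings, so it also catches cases like `model.gguf` vs `./model.gguf`, and the `error_code` overload stays quiet when the output does not exist yet. A minimal sketch of the same check:

```cpp
#include <cstdio>
#include <filesystem>
#include <string>

// Minimal sketch of the removed guard: true only if both paths resolve to the
// same existing file (symlinks followed); errors (e.g. output missing) -> false.
static bool same_file(const std::string & fname_inp, const std::string & fname_out) {
    std::error_code ec;
    return std::filesystem::equivalent(fname_inp, fname_out, ec);
}

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <in> <out>\n", argv[0]);
        return 1;
    }
    if (same_file(argv[1], argv[2])) {
        fprintf(stderr, "error: input and output files are the same: '%s'\n", argv[1]);
        return 1;
    }
    printf("ok\n");
    return 0;
}
```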
Binary file not shown.
@@ -65,7 +65,10 @@ export async function copyCodeToClipboard(
 	successMessage = 'Code copied to clipboard',
 	errorMessage = 'Failed to copy code'
 ): Promise<boolean> {
-	return copyToClipboard(rawCode, successMessage, errorMessage);
+	const doc = new DOMParser().parseFromString(rawCode, 'text/html');
+	const decodedCode = doc.body.textContent ?? rawCode;
+
+	return copyToClipboard(decodedCode, successMessage, errorMessage);
 }
 
 /**