Merge branch 'master' into HEAD

commit 22c7f85b9c
Georgi Gerganov, 2025-12-14 10:19:58 +02:00
107 changed files with 4419 additions and 2838 deletions

View File

@@ -4,7 +4,7 @@
 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -111,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 ENTRYPOINT [ "/app/llama-cli" ]

View File

@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 WORKDIR /app

View File

@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 WORKDIR /app

View File

@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light
 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 WORKDIR /app

View File

@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 WORKDIR /app

View File

@@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 WORKDIR /app

View File

@@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin
 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

View File

@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     exec ./llama-cli "$@"
+elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
+    exec ./llama-completion "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
     exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model (chat) previously converted into ggml"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin"
+    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
     echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
     echo "      ex: -m model.gguf"
     echo "  --perplexity (-p): Measure the perplexity of a model over a given text."

View File

@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 WORKDIR /app

View File

@@ -20,7 +20,8 @@ on:
       '**/*.swift',
       '**/*.m',
       '**/*.metal',
-      '**/*.comp'
+      '**/*.comp',
+      '**/*.glsl'
     ]

   pull_request:
@@ -40,7 +41,8 @@ on:
       '**/*.swift',
       '**/*.m',
       '**/*.metal',
-      '**/*.comp'
+      '**/*.comp',
+      '**/*.glsl'
     ]

 concurrency:
@@ -1400,25 +1402,54 @@ jobs:
         chip_type: ['910b', '310p']
         build: ['Release']
     runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
-      - name: Dependencies
-        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
-      - name: Build
-        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-          cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-              -DGGML_CANN=on \
-              -DSOC_TYPE=ascend${{ matrix.chip_type }}
-          cmake --build build -j $(nproc)
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+      - name: Set container image
+        id: cann-image
+        run: |
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+      - name: Pull container image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"
+      - name: Build
+        env:
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
+        run: |
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+              cmake -S . -B build \
+                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+                  -DGGML_CANN=on \
+                  -DSOC_TYPE=${SOC_TYPE}
+              cmake --build build -j $(nproc)
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '

   # TODO: simplify the following workflows using a matrix
   # TODO: run lighter CI on PRs and the full CI only on master (if needed)

View File

@@ -731,6 +731,78 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
           name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz

+  openEuler-cann:
+    strategy:
+      matrix:
+        arch: [x86, aarch64]
+        chip_type: ['910b', '310p']
+        build: ['Release']
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+      - name: Set container image
+        id: cann-image
+        run: |
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+      - name: Pull container image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"
+      - name: Build
+        env:
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
+        run: |
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+              cmake -S . -B build \
+                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+                  -DGGML_CANN=on \
+                  -DSOC_TYPE=${SOC_TYPE}
+              cmake --build build -j $(nproc)
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+      - name: Pack artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -752,6 +824,7 @@ jobs:
       - macOS-arm64
       - macOS-x64
       - ios-xcode-build
+      - openEuler-cann

     steps:
       - name: Clone
@@ -844,6 +917,12 @@ jobs:
            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

+           **openEuler:**
+           - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
+           - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
+           - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
+           - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)

       - name: Upload release
         id: upload_release
         uses: actions/github-script@v3

.gitignore vendored
View File

@@ -54,6 +54,7 @@
 /out/
 /tmp/
 /autogen-*.md
+/common/build-info.cpp

 # Deprecated

View File

@@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
 bool common_arg::get_value_from_env(std::string & output) const {
     if (env == nullptr) return false;
+    if (!args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        char * neg_value = std::getenv(neg_env.c_str());
+        if (neg_value) {
+            output = "0"; // falsey
+            return true;
+        }
+    }
     char * value = std::getenv(env);
     if (value) {
         output = value;
@@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
 }

 bool common_arg::has_value_from_env() const {
+    if (env != nullptr && !args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        if (std::getenv(neg_env.c_str())) {
+            return true;
+        }
+    }
     return env != nullptr && std::getenv(env);
 }
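The two additions above give every paired option a compatibility path for the old LLAMA_ARG_NO_* environment variables: if the negated variable is set, the option reports a falsey value before the positive variable is even consulted. A standalone sketch of that lookup order (resolve_bool_env is a hypothetical helper written for illustration, not the common_arg API):

    // Standalone sketch of the LLAMA_ARG_ / LLAMA_ARG_NO_ fallback shown above.
    // resolve_bool_env() is a hypothetical helper, not part of common/arg.cpp.
    #include <cstdlib>
    #include <optional>
    #include <string>

    static std::optional<bool> resolve_bool_env(const std::string & env) {
        // the legacy negated variable wins and forces "false"
        std::string neg = env;
        const std::string prefix = "LLAMA_ARG_";
        if (neg.rfind(prefix, 0) == 0) {
            neg.insert(prefix.size(), "NO_"); // LLAMA_ARG_FOO -> LLAMA_ARG_NO_FOO
        }
        if (std::getenv(neg.c_str())) {
            return false;
        }
        if (const char * v = std::getenv(env.c_str())) {
            // simplification: "on"/"enabled"/"true"/"1" are truthy, anything else falsey
            std::string s = v;
            return s == "on" || s == "enabled" || s == "true" || s == "1";
        }
        return std::nullopt; // not set either way
    }

    int main() {
        // e.g. LLAMA_ARG_NO_CONTEXT_SHIFT=1 yields false even if
        // LLAMA_ARG_CONTEXT_SHIFT is also set, mirroring the compatibility path.
        auto v = resolve_bool_env("LLAMA_ARG_CONTEXT_SHIFT");
        return v.has_value() && *v ? 0 : 1;
    }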
@@ -151,9 +169,10 @@ std::string common_arg::to_string() const {
     std::string leading_spaces(n_leading_spaces, ' ');

     std::ostringstream ss;
-    for (const auto arg : args) {
-        if (arg == args.front()) {
-            if (args.size() == 1) {
+    auto all_args = get_args(); // also contains args_neg
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
+            if (all_args.size() == 1) {
                 ss << arg;
             } else {
                 // first arg is usually abbreviation, we need padding to make it more beautiful
@@ -162,7 +181,7 @@ std::string common_arg::to_string() const {
                 ss << tmp << spaces;
             }
         } else {
-            ss << arg << (arg != args.back() ? ", " : "");
+            ss << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     if (value_hint) ss << " " << value_hint;
@@ -181,6 +200,31 @@ std::string common_arg::to_string() const {
     return ss.str();
 }

+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+    for (const auto & arg : args) {
+        result.push_back(std::string(arg));
+    }
+    for (const auto & arg : args_neg) {
+        result.push_back(std::string(arg));
+    }
+    return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+    if (env) {
+        result.push_back(std::string(env));
+    }
+    if (!args_neg.empty() && env) {
+        // for compatibility, we need to add LLAMA_ARG_NO_ variant
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        result.push_back(neg_env);
+    }
+    return result;
+}
+
 //
 // utils
 //
@@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() {
     return msg.str();
 }

+static bool parse_bool_value(const std::string & value) {
+    if (is_truthy(value)) {
+        return true;
+    } else if (is_falsey(value)) {
+        return false;
+    } else {
+        throw std::invalid_argument("invalid boolean value");
+    }
+}
+
 //
 // CLI argument parsing functions
 //
@@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;

-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
     for (auto & opt : ctx_arg.options) {
         for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
+            arg_to_options[arg] = {&opt, /* is_positive */ true};
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = {&opt, /* is_positive */ false};
         }
     }
@@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         std::string value;
         if (opt.get_value_from_env(value)) {
             try {
-                if (opt.handler_void && (value == "1" || value == "true")) {
+                if (opt.handler_void && is_truthy(value)) {
                     opt.handler_void(params);
                 }
                 if (opt.handler_int) {
                     opt.handler_int(params, std::stoi(value));
                 }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, parse_bool_value(value));
+                }
                 if (opt.handler_string) {
                     opt.handler_string(params, value);
                     continue;
@@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
-        auto opt = *arg_to_options[arg];
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
         if (opt.has_value_from_env()) {
             fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
         }
@@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             opt.handler_void(params);
             continue;
         }
+        if (opt.handler_bool) {
+            opt.handler_bool(params, is_positive);
+            continue;
+        }

         // arg with single value
         check_arg(i);
@@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             throw std::invalid_argument(string_format(
                 "error while handling argument \"%s\": %s\n\n"
                 "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+                arg.c_str(), e.what(), opt.to_string().c_str()));
         }
     }
@@ -438,7 +504,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     // model is required (except for server)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
         throw std::invalid_argument("error: --model is required\n");
     }
@@ -573,6 +639,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-batched-bench",
         "llama-bench",
         "llama-cli",
+        "llama-completion",
         "llama-convert-llama2c-to-ggml",
         "llama-cvector-generator",
         "llama-embedding",
@@ -657,7 +724,7 @@ static void add_rpc_devices(const std::string & servers) {
     }
 }

-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
     common_params dummy_params;
     common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
@@ -666,6 +733,9 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<comm
         for (const auto & arg : opt.args) {
             arg_to_options[arg] = &opt;
         }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = &opt;
+        }
     }

     // TODO @ngxson : find a way to deduplicate this code
@@ -750,11 +820,11 @@ static std::string list_builtin_chat_templates() {
 }

 bool common_arg_utils::is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "1";
+    return value == "on" || value == "enabled" || value == "true" || value == "1";
 }

 bool common_arg_utils::is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "0";
+    return value == "off" || value == "disabled" || value == "false" || value == "0";
 }

 bool common_arg_utils::is_autoy(const std::string & value) {
@@ -839,10 +909,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--display-prompt"},
         {"--no-display-prompt"},
-        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](common_params & params) {
-            params.display_prompt = false;
+        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.display_prompt = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
@@ -1055,18 +1126,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.kv_unified = true;
         }
     ).set_env("LLAMA_ARG_KV_UNIFIED"));
-    add_opt(common_arg(
-        {"--no-context-shift"},
-        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.ctx_shift = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.ctx_shift = true;
+        {"--no-context-shift"},
+        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.ctx_shift = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(
@@ -1106,20 +1171,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
+        {"--perf"},
         {"--no-perf"},
-        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params) {
-            params.no_perf = true;
-            params.sampling.no_perf = true;
+        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.no_perf = !value;
+            params.sampling.no_perf = !value;
         }
-    ).set_env("LLAMA_ARG_NO_PERF"));
+    ).set_env("LLAMA_ARG_PERF"));
     add_opt(common_arg(
+        {"--show-timings"},
         {"--no-show-timings"},
-        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
-        [](common_params & params) {
-            params.show_timings = false;
+        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.show_timings = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
@@ -1171,16 +1238,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
-        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params) {
-            params.escape = true;
-        }
-    ));
-    add_opt(common_arg(
         {"--no-escape"},
-        "do not process escape sequences",
-        [](common_params & params) {
-            params.escape = false;
+        string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.escape = value;
         }
     ));
     add_opt(common_arg(
@@ -1227,19 +1288,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode:\n"
+        {"-no-cnv", "--no-conversation"},
+        "whether to run in conversation mode:\n"
         "- does not print special tokens and suffix/prefix\n"
         "- interactive mode is also enabled\n"
         "(default: auto enabled if chat template is available)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
-    add_opt(common_arg(
-        {"-no-cnv", "--no-conversation"},
-        "force disable conversation mode (default: false)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params, bool value) {
+            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
@@ -1297,10 +1352,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"--warmup"},
         {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](common_params & params) {
-            params.warmup = false;
+        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.warmup = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
@@ -1709,19 +1765,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"-kvo", "--kv-offload"},
         {"-nkvo", "--no-kv-offload"},
-        "disable KV offload",
-        [](common_params & params) {
-            params.no_kv_offload = true;
+        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_kv_offload = !value;
         }
-    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
     add_opt(common_arg(
+        {"--repack"},
         {"-nr", "--no-repack"},
-        "disable weight repacking",
-        [](common_params & params) {
-            params.no_extra_bufts = true;
+        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_extra_bufts = !value;
         }
-    ).set_env("LLAMA_ARG_NO_REPACK"));
+    ).set_env("LLAMA_ARG_REPACK"));
     add_opt(common_arg(
         {"--no-host"},
         "bypass host buffer allowing extra buffers to be used",
@@ -1850,20 +1908,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-cb", "--cont-batching"},
-        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.cont_batching = true;
+        {"-nocb", "--no-cont-batching"},
+        string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cont_batching = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
     add_opt(common_arg(
-        {"-nocb", "--no-cont-batching"},
-        "disable continuous batching",
-        [](common_params & params) {
-            params.cont_batching = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
-    add_opt(common_arg(
-        {"--mmproj"}, "FILE",
+        {"-mm", "--mmproj"}, "FILE",
         "path to a multimodal projector file. see tools/mtmd/README.md\n"
         "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
@@ -1871,26 +1923,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
-        {"--mmproj-url"}, "URL",
+        {"-mmu", "--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
-        {"--no-mmproj"},
-        "explicitly disable multimodal projector, useful when using -hf",
-        [](common_params & params) {
-            params.no_mmproj = true;
+        {"--mmproj-auto"},
+        {"--no-mmproj", "--no-mmproj-auto"},
+        string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_mmproj = !value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
     add_opt(common_arg(
+        {"--mmproj-offload"},
         {"--no-mmproj-offload"},
-        "do not offload multimodal projector to GPU",
-        [](common_params & params) {
-            params.mmproj_use_gpu = false;
+        string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.mmproj_use_gpu = value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
@@ -1930,12 +1984,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(common_arg(
+        {"--mmap"},
         {"--no-mmap"},
-        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
-        [](common_params & params) {
-            params.use_mmap = false;
+        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_mmap = value;
         }
-    ).set_env("LLAMA_ARG_NO_MMAP"));
+    ).set_env("LLAMA_ARG_MMAP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -2123,10 +2178,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--op-offload"},
         {"--no-op-offload"},
-        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
-        [](common_params & params) {
-            params.no_op_offload = true;
+        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+        [](common_params & params, bool value) {
+            params.no_op_offload = !value;
         }
     ));
     add_opt(common_arg(
@@ -2322,10 +2378,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
+        {"--ppl"},
         {"--no-ppl"},
-        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](common_params & params) {
-            params.compute_ppl = false;
+        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.compute_ppl = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
@@ -2444,12 +2501,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
+        {"--webui"},
         {"--no-webui"},
-        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.webui = false;
+        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2554,18 +2612,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(common_arg(
         {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
+        {"--no-slots"},
+        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.endpoint_slots = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(common_arg(
-        {"--no-slots"},
-        "disables slots monitoring endpoint",
-        [](common_params & params) {
-            params.endpoint_slots = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",
@@ -2616,26 +2668,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
+        {"--models-autoload"},
         {"--no-models-autoload"},
-        "disables automatic loading of models (default: enabled)",
-        [](common_params & params) {
-            params.models_autoload = false;
+        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_autoload = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = true;
+        {"--no-jinja"},
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
-        {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.use_jinja = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2680,15 +2727,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
+        {"--prefill-assistant"},
         {"--no-prefill-assistant"},
         string_format(
             "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
             "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
         ),
-        [](common_params & params) {
-            params.prefill_assistant = false;
+        [](common_params & params, bool value) {
+            params.prefill_assistant = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),

View File

@@ -16,6 +16,7 @@ struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
     std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
+    std::vector<const char *> args_neg; // for negated args like --no-xxx
     const char * value_hint   = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
     const char * env          = nullptr;
@@ -25,6 +26,7 @@ struct common_arg {
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
     void (*handler_int)    (common_params & params, int) = nullptr;
+    void (*handler_bool)   (common_params & params, bool) = nullptr;

     common_arg() = default;
@@ -48,6 +50,13 @@ struct common_arg {
         void (*handler)(common_params & params)
     ) : args(args), help(help), handler_void(handler) {}

+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::initializer_list<const char *> & args_neg,
+        const std::string & help,
+        void (*handler)(common_params & params, bool)
+    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
+
     // support 2 values for arg
     common_arg(
         const std::initializer_list<const char *> & args,
@@ -80,6 +89,10 @@ struct common_arg {
         }
         return strcmp(args[0], other.args[0]) == 0;
     }
+
+    // get all args and env vars (including negated args/env)
+    std::vector<std::string> get_args() const;
+    std::vector<std::string> get_env() const;
 };

 namespace common_arg_utils {
@@ -102,7 +115,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 // parse input arguments from CLI into a map
 // TODO: support repeated args in the future
-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

View File

@@ -12,6 +12,8 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
+#include <map>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 #elif defined(LLAMA_USE_HTTPLIB)

-static bool is_output_a_tty() {
+class ProgressBar {
+    static inline std::mutex mutex;
+    static inline std::map<const ProgressBar *, int> lines;
+    static inline int max_line = 0;
+
+    static void cleanup(const ProgressBar * line) {
+        lines.erase(line);
+        if (lines.empty()) {
+            max_line = 0;
+        }
+    }
+
+    static bool is_output_a_tty() {
 #if defined(_WIN32)
         return _isatty(_fileno(stdout));
 #else
         return isatty(1);
 #endif
-}
-
-static void print_progress(size_t current, size_t total) {
-    if (!is_output_a_tty()) {
-        return;
-    }
-
-    if (!total) {
-        return;
-    }
-
-    size_t width = 50;
-    size_t pct = (100 * current) / total;
-    size_t pos = (width * current) / total;
-
-    std::cout << "["
-              << std::string(pos, '=')
-              << (pos < width ? ">" : "")
-              << std::string(width - pos, ' ')
-              << "] " << std::setw(3) << pct << "% ("
-              << current / (1024 * 1024) << " MB / "
-              << total / (1024 * 1024) << " MB)\r";
-    std::cout.flush();
-}
+    }
+
+  public:
+    ProgressBar() = default;
+
+    ~ProgressBar() {
+        std::lock_guard<std::mutex> lock(mutex);
+        cleanup(this);
+    }
+
+    void update(size_t current, size_t total) {
+        if (!is_output_a_tty()) {
+            return;
+        }
+        if (!total) {
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (lines.find(this) == lines.end()) {
+            lines[this] = max_line++;
+            std::cout << "\n";
+        }
+        int lines_up = max_line - lines[this];
+
+        size_t width = 50;
+        size_t pct = (100 * current) / total;
+        size_t pos = (width * current) / total;
+
+        std::cout << "\033[s";
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "A";
+        }
+        std::cout << "\033[2K\r["
+                  << std::string(pos, '=')
+                  << (pos < width ? ">" : "")
+                  << std::string(width - pos, ' ')
+                  << "] " << std::setw(3) << pct << "% ("
+                  << current / (1024 * 1024) << " MB / "
+                  << total / (1024 * 1024) << " MB) "
+                  << "\033[u";
+        std::cout.flush();
+
+        if (current == total) {
+            cleanup(this);
+        }
+    }
+
+    ProgressBar(const ProgressBar &) = delete;
+    ProgressBar & operator=(const ProgressBar &) = delete;
+};

 static bool common_pull_file(httplib::Client & cli,
                              const std::string & resolve_path,
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
     const char * func = __func__; // avoid __func__ inside a lambda
     size_t downloaded = existing_size;
     size_t progress_step = 0;
+    ProgressBar bar;

     auto res = cli.Get(resolve_path, headers,
         [&](const httplib::Response &response) {
@@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
             progress_step += len;
             if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                print_progress(downloaded, total_size);
+                bar.update(downloaded, total_size);
                 progress_step = 0;
             }
             return true;
return true; return true;
@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
nullptr nullptr
); );
std::cout << "\n";
if (!res) { if (!res) {
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1); LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
return false; return false;

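The ProgressBar class introduced above assigns each concurrent download its own terminal line and repaints it in place using ANSI escape sequences (save cursor, move up, clear line, restore). A standalone sketch of just that escape-sequence technique, independent of the class:

    // Standalone sketch of the ANSI redraw technique used by ProgressBar above:
    // save the cursor, jump up to the bar's line, clear it, draw, restore.
    #include <iostream>
    #include <string>

    static void draw_bar(int lines_up, size_t current, size_t total) {
        const size_t width = 50;
        const size_t pos   = total ? (width * current) / total : 0;
        std::cout << "\033[s";                       // save cursor position
        if (lines_up > 0) {
            std::cout << "\033[" << lines_up << "A"; // move up to the bar's line
        }
        std::cout << "\033[2K\r["                    // clear the line and redraw
                  << std::string(pos, '=') << (pos < width ? ">" : "")
                  << std::string(width - pos, ' ') << "] "
                  << (total ? 100 * current / total : 0) << "%"
                  << "\033[u";                       // restore cursor position
        std::cout.flush();
    }

    int main() {
        std::cout << "\n"; // reserve one line for the bar
        for (size_t i = 0; i <= 100; i += 20) {
            draw_bar(1, i, 100);
        }
        std::cout << std::endl;
        return 0;
    }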
View File

@ -23,8 +23,14 @@ std::vector<std::string> common_preset::to_args() const {
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// flag option, no value // flag option, no value
if (common_arg_utils::is_falsey(value)) { if (common_arg_utils::is_falsey(value)) {
// skip the flag // use negative arg if available
args.pop_back(); if (!opt.args_neg.empty()) {
args.back() = opt.args_neg.back();
} else {
// otherwise, skip the flag
// TODO: maybe throw an error instead?
args.pop_back();
}
} }
} }
if (opt.value_hint != nullptr) { if (opt.value_hint != nullptr) {
@ -141,10 +147,10 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) { static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
std::map<std::string, common_arg> mapping; std::map<std::string, common_arg> mapping;
for (const auto & opt : ctx_params.options) { for (const auto & opt : ctx_params.options) {
if (opt.env != nullptr) { for (const auto & env : opt.get_env()) {
mapping[opt.env] = opt; mapping[env] = opt;
} }
for (const auto & arg : opt.args) { for (const auto & arg : opt.get_args()) {
mapping[rm_leading_dashes(arg)] = opt; mapping[rm_leading_dashes(arg)] = opt;
} }
} }

View File

@@ -7286,6 +7286,10 @@ class DeepseekV2Model(TextModel):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+            # ref https://github.com/ggml-org/llama.cpp/pull/17945
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])

     _experts: list[dict[str, Tensor]] | None = None
@@ -10041,6 +10045,10 @@ class MistralMoeModel(DeepseekV2Model):
         MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
         yarn_params = self.hparams["yarn"]
         self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
+        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+        # ref https://github.com/ggml-org/llama.cpp/pull/17945
         self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1)  # mscale_all_dim * 0.1

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):

View File

@@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
 static void write_table_entry(std::ofstream & file, const common_arg & opt) {
     file << "| `";
     // args
-    for (const auto & arg : opt.args) {
-        if (arg == opt.args.front()) {
+    auto all_args = opt.get_args();
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
             file << arg;
-            if (opt.args.size() > 1) file << ", ";
+            if (all_args.size() > 1) file << ", ";
         } else {
-            file << arg << (arg != opt.args.back() ? ", " : "");
+            file << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     // value hint

View File

@@ -1,10 +1,13 @@
 #!/usr/bin/env python3

-import numpy as np
 import sys
-import os
+import numpy as np
 from pathlib import Path

+# Add utils directory to path for direct script execution
+sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
+from common import get_model_name_from_env_path  # type: ignore[import-not-found]
+
 def quick_logits_check(pytorch_file, llamacpp_file):
     """Lightweight sanity check before NMSE"""
@@ -32,27 +35,16 @@ def quick_logits_check(pytorch_file, llamacpp_file):
     print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
     print(f"Max absolute difference: {max_diff:.4f}")

-    if max_diff > 1.0:
-        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
-        return False
-
     return True

 def main():
-    model_path = os.getenv('MODEL_PATH')
-    if not model_path:
-        print("Error: MODEL_PATH environment variable not set")
-        sys.exit(1)
-
-    if not os.path.exists(model_path):
-        print(f"Error: Model file not found: {model_path}")
-        sys.exit(1)
-
-    model_name = os.path.basename(model_path)
+    model_name = get_model_name_from_env_path('MODEL_PATH')
     data_dir = Path("data")
     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
-    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
+
+    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
+    print(f"Using converted model: {llamacpp_model_name}")
+    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"

     if not pytorch_file.exists():
         print(f"Error: PyTorch logits file not found: {pytorch_file}")

View File

@@ -200,7 +200,7 @@ with torch.no_grad():
     logits = outputs.logits

     # Extract logits for the last token (next token prediction)
-    last_logits = logits[0, -1, :].cpu().numpy()
+    last_logits = logits[0, -1, :].float().cpu().numpy()

     print(f"Logits shape: {logits.shape}")
     print(f"Last token logits shape: {last_logits.shape}")

View File

@ -5,6 +5,7 @@ import sys
import os import os
import argparse import argparse
from pathlib import Path from pathlib import Path
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def calculate_nmse(reference, test): def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2) mse = np.mean((test - reference) ** 2)
@ -67,11 +68,13 @@ def main():
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory') parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
args = parser.parse_args() args = parser.parse_args()
model_name = os.path.basename(args.model_path) model_name = get_model_name_from_env_path('MODEL_PATH')
data_dir = Path("data") data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin" pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
print(f"Model name: {model_name}") print(f"Model name: {model_name}")
print(f"PyTorch logits file: {pytorch_file}") print(f"PyTorch logits file: {pytorch_file}")

View File

@ -0,0 +1,20 @@
#!/usr/bin/env python3
import os
import sys
def get_model_name_from_env_path(env_path_name):
model_path = os.getenv(env_path_name)
if not model_path:
print(f"Error: {env_path_name} environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
name = os.path.basename(os.path.normpath(model_path))
if name.endswith(".gguf"):
name = name[:-5]
return name
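The helper centralizes the name normalization both test scripts previously did by hand. A self-contained sketch of the effective behavior (hypothetical path, for illustration only):

import os

path = "/models/Llama-3.2-1B.gguf/"               # hypothetical MODEL_PATH value
name = os.path.basename(os.path.normpath(path))   # normpath drops the trailing slash
if name.endswith(".gguf"):
    name = name[:-5]
print(name)                                        # -> Llama-3.2-1B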

View File

@ -255,6 +255,8 @@ int main(int argc, char ** argv) {
LOG_INF("target:\n\n"); LOG_INF("target:\n\n");
common_perf_print(ctx_tgt, smpl); common_perf_print(ctx_tgt, smpl);
llama_batch_free(batch_tgt);
common_sampler_free(smpl); common_sampler_free(smpl);
common_speculative_free(spec); common_speculative_free(spec);

View File

@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
# TODO # TODO
else() else()
set(GGML_STANDALONE OFF) set(GGML_STANDALONE OFF)
if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
endif()
endif() endif()
if (EMSCRIPTEN) if (EMSCRIPTEN)

View File

@ -99,6 +99,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_sme (void); GGML_BACKEND_API int ggml_cpu_has_sme (void);
// other // other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);

View File

@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_ACC: case GGML_OP_ACC:
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
return true;
case GGML_OP_PAD: case GGML_OP_PAD:
// TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985 // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
return ggml_get_op_params_i32(op, 8) == 0; return ggml_get_op_params_i32(op, 8) == 0;

View File

@ -24,6 +24,7 @@
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in, static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
int16x8_t * out_mins, int16x8_t * out_mins,
int8_t * out_scales) { int8_t * out_scales) {
@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4); scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
memcpy(out_scales, scales_u32, 8); memcpy(out_scales, scales_u32, 8);
} }
#endif
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32); assert(QK8_0 == 32);

View File

@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
} ggml_arm_arch_features = { 0 }; } ggml_arm_arch_features = { 0 };
#endif #endif
#if defined(__riscv)
struct ggml_riscv_arch_features_type {
int rvv_vlen;
} ggml_riscv_arch_features = { 0 };
#endif
#if defined(_WIN32) #if defined(_WIN32)
@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
#endif #endif
#endif // __ARM_ARCH #endif // __ARM_ARCH
#if defined(__riscv) && defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
static void ggml_init_riscv_arch_features(void) {
ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
}
#else
static void ggml_init_riscv_arch_features(void) {}
#endif
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
GGML_ASSERT(!ggml_get_no_alloc(ctx)); GGML_ASSERT(!ggml_get_no_alloc(ctx));
@ -3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
#endif #endif
} }
int ggml_cpu_get_rvv_vlen(void) {
#if defined(__riscv) && defined(__riscv_v_intrinsic)
return ggml_riscv_arch_features.rvv_vlen;
#else
return 0;
#endif
}
int ggml_cpu_has_f16c(void) { int ggml_cpu_has_f16c(void) {
#if defined(__F16C__) #if defined(__F16C__)
return 1; return 1;
@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) {
ggml_init_arm_arch_features(); ggml_init_arm_arch_features();
#endif #endif
#if defined(__riscv)
ggml_init_riscv_arch_features();
#endif
is_first_call = false; is_first_call = false;
} }

View File

@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_riscv_v()) { if (ggml_cpu_has_riscv_v()) {
features.push_back({ "RISCV_V", "1" }); features.push_back({ "RISCV_V", "1" });
} }
if (ggml_cpu_get_rvv_vlen() > 0) {
static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
}
if (ggml_cpu_has_vsx()) { if (ggml_cpu_has_vsx()) {
features.push_back({ "VSX", "1" }); features.push_back({ "VSX", "1" });
} }

View File

@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0; static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
if (cur->type == GGML_TYPE_Q4_0) { if (cur->type == GGML_TYPE_Q4_0) {
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
if (cur->ne[1] % 8 == 0) { if (cur->ne[1] % 8 == 0) {
return &q4_0_8x8_q8_0; return &q4_0_8x8_q8_0;
} }

View File

@ -67,19 +67,22 @@
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA3_5 (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 #define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) #define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) #define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) #define GGML_CUDA_CC_IS_RDNA3(cc) (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) #define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
// Moore Threads // Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons

View File

@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;
const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const bool did_not_have_any_data = kbc0 == kbc0_stop; const bool did_not_have_any_data = kbc0 == kbc0_stop;
const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
int bidx = bidx0 - 1; int bidx = bidx0 - 1;
int kbc_stop = kbc0; int kbc_stop = kbc0;
while(true) { while(true) {
const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
if (kbc == kbc_stop) { // Did not have any data. if (kbc == kbc_stop) { // Did not have any data.
bidx--; bidx--;
kbc_stop = kbc; kbc_stop = kbc;
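The int64_t casts guard the work-split index math: bidx * iter_k * iter_j * (ne02/ncols2) * ne03 can exceed 32 bits for long contexts, so the old all-int expression could wrap before the division by gridDim.x. Illustrative numbers only (not taken from a real run); the wrap-around is simulated by masking to 32 bits:

bidx, iter_k, iter_j, ne02, ne03 = 1024, 2048, 64, 32, 1
exact = bidx * iter_k * iter_j * ne02 * ne03
wrapped = exact & 0xFFFFFFFF                 # what a 32-bit multiply would keep
print(exact, wrapped, exact == wrapped)      # 4294967296 0 False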

View File

@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1; const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1;
// kbc == k block continuous, current index in continuous ijk space. // kbc == k block continuous, current index in continuous ijk space.
int kbc = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
// If the seams of 2 CUDA blocks fall within an output tile their results need to be combined. // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
// For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup). // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16(
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0);
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
const half * mask_h = ncols2 == 1 && !mask ? nullptr : const half * mask_h = ncols2 == 1 && !mask ? nullptr :
(const half *) (mask + nb33*(sequence % ne33)); (const half *) (mask + nb33*(sequence % ne33));
float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));

View File

@ -4644,9 +4644,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CUMSUM: case GGML_OP_CUMSUM:
case GGML_OP_TRI: case GGML_OP_TRI:
case GGML_OP_DIAG: case GGML_OP_DIAG:
return true;
case GGML_OP_SOLVE_TRI: case GGML_OP_SOLVE_TRI:
return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32; return true;
default: default:
return false; return false;
} }

View File

@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
return 8 * (threadIdx.x / 16) + l; return 8 * (threadIdx.x / 16) + l;
#elif defined(RDNA3) #elif defined(RDNA3)
return 2 * l + (threadIdx.x / 16); return 2 * l + (threadIdx.x / 16);
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4) #endif // defined(RDNA4)
} else { } else {
NO_DEVICE_CODE; NO_DEVICE_CODE;
@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
} }
} }
#elif defined(AMD_WMMA_AVAILABLE) #elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3)
// RDNA3 has duplicated data as input.
static constexpr int ne = I * J / 32 * 2;
#else
static constexpr int ne = I * J / 32; static constexpr int ne = I * J / 32;
#endif // defined(RDNA3)
half2 x[ne] = {{0.0f, 0.0f}}; half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() { static constexpr __device__ bool supported() {
@ -310,7 +317,14 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) { static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) { if constexpr (I == 16 && J == 8) {
#if defined(RDNA4)
return 4 * (threadIdx.x / 16) + l; return 4 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return l;
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
} else { } else {
NO_DEVICE_CODE; NO_DEVICE_CODE;
return -1; return -1;
@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
static constexpr int I = I_; static constexpr int I = I_;
static constexpr int J = J_; static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
#if defined(AMD_WMMA_AVAILABLE) #if defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3)
// RDNA3 has duplicated data as input.
static constexpr int ne = I * J / 32 * 2;
#else
static constexpr int ne = I * J / 32;
#endif // defined(RDNA3)
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() { static constexpr __device__ bool supported() {
if (I == 16 && J == 8) return true; if (I == 16 && J == 8) return true;
return false; return false;
@ -387,13 +406,23 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) { static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) { if constexpr (I == 16 && J == 8) {
#if defined(RDNA4)
return 4 * (threadIdx.x / 16) + l; return 4 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return l;
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
} else { } else {
NO_DEVICE_CODE; NO_DEVICE_CODE;
return -1; return -1;
} }
} }
#else #else
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() { static constexpr __device__ bool supported() {
if (I == 8 && J == 8) return true; if (I == 8 && J == 8) return true;
if (I == 16 && J == 4) return true; if (I == 16 && J == 4) return true;
@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
} }
#elif defined(AMD_WMMA_AVAILABLE) #elif defined(AMD_WMMA_AVAILABLE)
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) { if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); #if defined(RDNA4)
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
#elif defined(RDNA3)
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
#else
NO_DEVICE_CODE;
#endif // defined(RDNA4)
} else if constexpr (std::is_same_v<T, int>) { } else if constexpr (std::is_same_v<T, int>) {
if constexpr (I == 16 && J == 4) { if constexpr (I == 16 && J == 4) {
int64_t * xi = (int64_t *) t.x; int64_t * xi = (int64_t *) t.x;
@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]); const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]); const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag); acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
#elif defined(RDNA3)
using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // RDNA4 #endif // RDNA4
#else #else
GGML_UNUSED_VARS(D, A, B); GGML_UNUSED_VARS(D, A, B);
@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]); const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]); const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag); acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
#elif defined(RDNA3)
using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // RDNA4 #endif // RDNA4
#else #else
GGML_UNUSED_VARS(D, A, B); GGML_UNUSED_VARS(D, A, B);

View File

@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
return false; return false;
} }
} else { } else {
if (src1_ncols > 16) { if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
return false;
} else if (src1_ncols > 16) {
return false; return false;
} }
} }
@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
case GGML_TYPE_F32: case GGML_TYPE_F32:
return ampere_mma_available(cc); return ampere_mma_available(cc);
case GGML_TYPE_F16: case GGML_TYPE_F16:
return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc)); return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
case GGML_TYPE_BF16: case GGML_TYPE_BF16:
return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc)); return ampere_mma_available(cc) || amd_wmma_available(cc);
default: default:
return false; return false;
} }

View File

@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
return ne11 <= 8; return ne11 <= 8;
} else if (GGML_CUDA_CC_IS_AMD(cc)) { } else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (fp16_mma_hardware_available(cc)) { if (fp16_mma_hardware_available(cc)) {
if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { if (GGML_CUDA_CC_IS_RDNA3(cc)) {
return ne11 <= 3;
}
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
return ne11 <= 5; return ne11 <= 5;
} }
return ne11 <= 2; return ne11 <= 2;

View File

@ -3,6 +3,80 @@
#include "solve_tri.cuh" #include "solve_tri.cuh"
#define MAX_N_FAST 64 #define MAX_N_FAST 64
#define MAX_K_FAST 32
static __global__ void get_batch_pointers(const float * A,
float * X,
const float ** A_ptrs,
float ** X_ptrs,
int64_t ne02,
int64_t total_batches,
size_t s02,
size_t s03,
size_t s2,
size_t s3) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total_batches) {
return;
}
const int64_t i3 = idx / ne02;
const int64_t i2 = idx % ne02;
A_ptrs[idx] = A + i3 * s03 + i2 * s02;
X_ptrs[idx] = X + i3 * s3 + i2 * s2;
}
static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
const float * A,
const float * B,
float * X,
int n,
int k,
int64_t ne02,
int64_t ne03,
size_t s02,
size_t s03,
size_t s12,
size_t s13,
size_t s2,
size_t s3,
cudaStream_t stream) {
const float alpha = 1.0f;
const int64_t total_batches = ne02 * ne03;
if (total_batches == 0) {
return;
}
// Bulk copy B -> X (contiguous tensors)
if (X != B) {
const int64_t total_elements_BX = n * k * total_batches;
CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
}
const int id = ggml_cuda_get_device();
ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
ggml_cuda_pool_alloc<float *> X_ptrs_alloc(ctx.pool(id), total_batches);
const float ** A_ptrs_dev = A_ptrs_alloc.get();
float ** X_ptrs_dev = X_ptrs_alloc.get();
get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
total_batches, s02, s03, s2, s3);
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
// Yes, this is necessary; without it we get RMSE errors
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
// revert to standard mode from common.cuh
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
GGML_UNUSED_VARS(s12, s13);
}
// ====================== // ======================
// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
float x_low = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f; float x_low = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f; float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;
const int half = WARP_SIZE; const int half = WARP_SIZE;
const int nrows_low = (n < half) ? n : half; const int nrows_low = (n < half) ? n : half;
#pragma unroll #pragma unroll
@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
#pragma unroll #pragma unroll
for (int row = half; row < n; ++row) { for (int row = half; row < n; ++row) {
float sum = sA[row * n + lane] * x_low; float sum = sA[row * n + lane] * x_low;
const int j = half + lane; const int j = half + lane;
if (j < row) { if (j < row) {
sum += sA[row * n + j] * x_high; sum += sA[row * n + j] * x_high;
} }
@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
for (int rr = 0; rr < 2; ++rr) { for (int rr = 0; rr < 2; ++rr) {
const int row = rr * WARP_SIZE + lane; const int row = rr * WARP_SIZE + lane;
if (row < n) { if (row < n) {
const float val = (row < half) ? x_low : x_high; const float val = (row < half) ? x_low : x_high;
X_batch[row * k + col_idx] = val; X_batch[row * k + col_idx] = val;
} }
} }
@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
} }
void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix) const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular)
const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns) const ggml_tensor * src1 = dst->src[1]; // B (n×k)
ggml_is_contiguous(src0); ggml_is_contiguous(src0);
ggml_is_contiguous(src1); ggml_is_contiguous(src1);
const int64_t n = src0->ne[0]; const int64_t n = src0->ne[0];
const int64_t k = src1->ne[0]; const int64_t k = src1->ne[0];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
GGML_ASSERT(n <= 64); if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
GGML_ASSERT(k <= 32); solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2], src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), dst->nb[3] / sizeof(float), ctx.stream());
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), } else {
dst->nb[3] / sizeof(float), ctx.stream()); solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
dst->nb[3] / sizeof(float), ctx.stream());
}
} }
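For reference, GGML_OP_SOLVE_TRI solves A X = B for X per (i2, i3) batch, with A lower triangular; the cuBLAS call above expresses this as side=RIGHT / fill=UPPER because ggml's row-major matrices appear transposed to column-major cuBLAS (my reading of the call, not spelled out in the patch). A NumPy sketch of the per-batch math that both the fast kernel and the new cuBLAS path must reproduce:

import numpy as np

n, k = 4, 3
A = np.tril(np.random.rand(n, n)) + np.eye(n)    # well-conditioned lower-triangular A
B = np.random.rand(n, k)
X = np.linalg.solve(A, B)                        # reference solution of A @ X = B
print(np.allclose(A @ X, B))                     # True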

View File

@ -19,6 +19,9 @@
#define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_16BF HIPBLAS_R_16B #define CUDA_R_16BF HIPBLAS_R_16B
#define CUDA_R_32F HIPBLAS_R_32F #define CUDA_R_32F HIPBLAS_R_32F
#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
@ -30,6 +33,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define __all_sync(mask, var) __all(var) #define __all_sync(mask, var) __all(var)
#define __any_sync(mask, var) __any(var) #define __any_sync(mask, var) __any(var)
#define cublasStrsmBatched hipblasStrsmBatched
#define cublasCreate hipblasCreate #define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy #define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx #define cublasGemmEx hipblasGemmEx

View File

@ -12,11 +12,16 @@
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N #define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T #define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
#define CUDA_R_16F MUSA_R_16F #define CUDA_R_16F MUSA_R_16F
#define CUDA_R_16BF MUSA_R_16BF #define CUDA_R_16BF MUSA_R_16BF
#define CUDA_R_32F MUSA_R_32F #define CUDA_R_32F MUSA_R_32F
#define cublasStrsmBatched mublasStrsmBatched
#define cublasComputeType_t cudaDataType_t #define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate #define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy #define cublasDestroy mublasDestroy

View File

@ -659,6 +659,7 @@ struct vk_device_struct {
vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_cos_f32;
vk_pipeline pipeline_log[2]; vk_pipeline pipeline_log[2];
vk_pipeline pipeline_tri[2]; vk_pipeline pipeline_tri[2];
vk_pipeline pipeline_diag[2];
vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_clamp_f32;
vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_roll_f32; vk_pipeline pipeline_roll_f32;
@ -722,6 +723,11 @@ struct vk_device_struct {
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
vk_pipeline pipeline_soft_max_back_f32; vk_pipeline pipeline_soft_max_back_f32;
vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16;
vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16;
vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16;
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16; vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
@ -757,7 +763,8 @@ struct vk_device_struct {
vk_pipeline pipeline_flash_attn_split_k_reduce; vk_pipeline pipeline_flash_attn_split_k_reduce;
vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT]; // [2] is for whether to take n_experts from spec constant (0) or push constant (1)
vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
std::vector<vk_pipeline_ref> all_pipelines; std::vector<vk_pipeline_ref> all_pipelines;
@ -1149,6 +1156,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
struct vk_op_topk_moe_push_constants { struct vk_op_topk_moe_push_constants {
uint32_t n_rows; uint32_t n_rows;
uint32_t n_experts_push;
uint32_t n_expert_used; uint32_t n_expert_used;
float clamp_min; float clamp_min;
float clamp_max; float clamp_max;
@ -3730,6 +3738,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@ -3917,6 +3926,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
@ -3996,6 +4008,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@ -4204,10 +4223,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { for (uint32_t use_push = 0; use_push < 2; ++use_push) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size); for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size); ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size); ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
}
} }
for (auto &c : compiles) { for (auto &c : compiles) {
@ -8274,6 +8295,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
switch (op) { switch (op) {
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
if (src0->type == GGML_TYPE_I32) {
// i32 src only supports i32 result
GGML_ASSERT(dst->type == GGML_TYPE_I32);
return ctx->device->pipeline_get_rows[src0->type];
}
if (dst->type == GGML_TYPE_F16) { if (dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_get_rows[src0->type]; return ctx->device->pipeline_get_rows[src0->type];
} }
@ -8400,6 +8426,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16]; return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16];
} }
return nullptr; return nullptr;
case GGML_OP_DIAG:
if (src0->type == dst->type &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16];
}
return nullptr;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_clamp_f32; return ctx->device->pipeline_clamp_f32;
@ -8554,7 +8586,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
GGML_ASSERT(idx < num_topk_moe_pipelines); GGML_ASSERT(idx < num_topk_moe_pipelines);
topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops); topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
return ctx->device->pipeline_topk_moe[idx][mode]; // use n_experts from push constant if it's not equal to the power of two spec constant
bool use_push = dst->ne[0] != (1u << idx);
return ctx->device->pipeline_topk_moe[idx][mode][use_push];
} }
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@ -9091,6 +9125,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_COS: case GGML_OP_COS:
case GGML_OP_LOG: case GGML_OP_LOG:
case GGML_OP_TRI: case GGML_OP_TRI:
case GGML_OP_DIAG:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ROLL: case GGML_OP_ROLL:
@ -9778,6 +9813,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p)); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
} }
static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p));
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
p.param1 = ggml_get_op_params_f32(dst, 0); p.param1 = ggml_get_op_params_f32(dst, 0);
@ -10111,7 +10152,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, { vk_op_soft_max_push_constants pc {
ncols, ncols,
src1 != nullptr ? nrows_y : (uint32_t)0, src1 != nullptr ? nrows_y : (uint32_t)0,
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
@ -10122,7 +10163,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
n_head_log2, n_head_log2,
nrows_x, nrows_x,
src2 != nullptr src2 != nullptr
}); };
if (ncols <= 16384) {
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
} else {
vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
uint32_t elems_per_wg = 128 * 4;
uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
size_t tmp_size = num_wgs * nrows_x * sizeof(float);
if (ctx->prealloc_size_x < tmp_size) {
ctx->prealloc_size_x = tmp_size;
ggml_vk_preallocate_buffers(ctx, subctx);
}
if (ctx->prealloc_size_y < tmp_size) {
ctx->prealloc_size_y = tmp_size;
ggml_vk_preallocate_buffers(ctx, subctx);
}
if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
ggml_vk_sync_buffers(ctx, subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
ggml_vk_sync_buffers(ctx, subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
ctx->prealloc_x_need_sync = true;
ctx->prealloc_y_need_sync = true;
}
} }
static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -10158,6 +10247,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
vk_op_topk_moe_push_constants pc {}; vk_op_topk_moe_push_constants pc {};
pc.n_rows = n_rows; pc.n_rows = n_rows;
pc.n_experts_push = n_experts;
pc.n_expert_used = n_expert_used; pc.n_expert_used = n_expert_used;
if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) { if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
ggml_tensor * clamp = cgraph->nodes[node_idx + 7]; ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
@ -11857,6 +11947,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_TRI: case GGML_OP_TRI:
ggml_vk_tri(ctx, compute_ctx, src0, node); ggml_vk_tri(ctx, compute_ctx, src0, node);
break;
case GGML_OP_DIAG:
ggml_vk_diag(ctx, compute_ctx, src0, node);
break; break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
ggml_vk_clamp(ctx, compute_ctx, src0, node); ggml_vk_clamp(ctx, compute_ctx, src0, node);
@ -12832,8 +12926,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
} }
const int n_expert = softmax->ne[0]; const int n_expert = softmax->ne[0];
// n_expert must be a power of 2 if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
return false; return false;
} }
@ -13877,6 +13970,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4: case GGML_TYPE_MXFP4:
case GGML_TYPE_I32:
return true; return true;
default: default:
return false; return false;
@ -14001,6 +14095,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_LOG: case GGML_OP_LOG:
case GGML_OP_TRI: case GGML_OP_TRI:
case GGML_OP_DIAG:
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
op->type == op->src[0]->type; op->type == op->src[0]->type;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
@ -14591,6 +14686,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
tensor_clone = ggml_log(ggml_ctx, src_clone[0]); tensor_clone = ggml_log(ggml_ctx, src_clone[0]);
} else if (tensor->op == GGML_OP_TRI) { } else if (tensor->op == GGML_OP_TRI) {
tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0)); tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0));
} else if (tensor->op == GGML_OP_DIAG) {
tensor_clone = ggml_diag(ggml_ctx, src_clone[0]);
} else if (tensor->op == GGML_OP_CLAMP) { } else if (tensor->op == GGML_OP_CLAMP) {
const float * params = (const float *)tensor->op_params; const float * params = (const float *)tensor->op_params;
tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
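Among the changes above, the new soft_max_large{1,2,3} pipelines handle rows with more than 16384 columns by splitting one row across several workgroups and staging partial results in the prealloc_x / prealloc_y scratch buffers, which is why the dispatch runs three pipelines with buffer syncs in between. A NumPy sketch of that three-pass structure (the exact work split inside the shaders is an assumption):

import numpy as np

x = np.random.rand(100_000).astype(np.float32)
chunks = np.array_split(x, 7)                            # stand-in for per-workgroup tiles
m = max(c.max() for c in chunks)                         # pass 1: per-chunk max, reduced
s = sum(np.exp(c - m).sum() for c in chunks)             # pass 2: per-chunk exp-sums, reduced
y = np.concatenate([np.exp(c - m) / s for c in chunks])  # pass 3: normalize
print(np.allclose(y, np.exp(x - x.max()) / np.exp(x - x.max()).sum()))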

View File

@ -0,0 +1,29 @@
#version 450
#include "rte.glsl"
#include "types.glsl"
#include "generic_unary_head.glsl"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
const uint i12_offset = i12*p.ne11*p.ne10;
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
if (i10 == i11) {
const float val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + 0*p.nb01 + i10*p.nb00]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val);
} else {
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0);
}
}

View File

@ -256,6 +256,9 @@ void main() {
barrier(); barrier();
} }
// prevent race on tmpsh
barrier();
// reduce across threads // reduce across threads
[[unroll]] for (uint32_t r = 0; r < Br; ++r) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) {

View File

@ -302,6 +302,9 @@ void main() {
barrier(); barrier();
} }
// prevent race on tmpsh
barrier();
// reduce across threads // reduce across threads
float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread]; float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread];

View File

@ -26,9 +26,9 @@ void main() {
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23; const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
#if defined(DATA_A_BF16) #if defined(DATA_A_BF16)
FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00])); TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
#else #else
FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]); TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]);
#endif #endif
#ifndef OPTIMIZATION_ERROR_WORKAROUND #ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[d_offset + i00] = D_TYPE(v); data_d[d_offset + i00] = D_TYPE(v);

View File

@ -7,34 +7,50 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];

- void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
-     const uint y_idx = i * QUANT_K + 32 * ib32;
-
-     uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
-     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-         const float d = float(data_a[ibi].d);
-         const uint qh = data_a[ibi].qh[ib32];
-         const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
-         const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
-
-         [[unroll]] for (uint l = 0; l < 4; ++l) {
-             const uint qs = data_a[ibi].qs[4 * ib32 + l];
-             const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
-             const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]);
-
-             [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-                 vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
-                 vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
-
-                 FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-                 [[unroll]] for (int k = 0; k < 4; ++k) {
-                     sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta,
-                           fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum));
-                 }
-                 temp[j][n] = fma(dl, sum, temp[j][n]);
-             }
-         }
-         ibi += num_blocks_per_row;
-     }
- }
+ void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
+                      const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+     const uint y_idx_base = i * QUANT_K + 32 * ib32;
+
+     [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+         const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx_base) / 4;
+         [[unroll]] for (uint l = 0; l < 4; ++l) {
+             const vec4 b_val_0 = vec4(data_b_v4[base_b_idx + 2 * l]);
+             const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]);
+
+             // index for data_a
+             uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+             [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                 const float d = float(data_a[ibi].d);
+                 const uint qh = data_a[ibi].qh[ib32];
+                 const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
+                 const uint qs = data_a[ibi].qs[4 * ib32 + l];
+                 const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
+                 const uint16_t grid = uint16_t(iq1s_grid[qs | (idxhi << 8)]);
+                 const float delta_val = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
+                 const vec4 delta_v = vec4(delta_val);
+                 const vec4 fbits0 = vec4(
+                     float(bitfieldExtract(grid, 0, 2)),
+                     float(bitfieldExtract(grid, 2, 2)),
+                     float(bitfieldExtract(grid, 4, 2)),
+                     float(bitfieldExtract(grid, 6, 2))
+                 );
+                 const vec4 fbits1 = vec4(
+                     float(bitfieldExtract(grid, 8, 2)),
+                     float(bitfieldExtract(grid, 10, 2)),
+                     float(bitfieldExtract(grid, 12, 2)),
+                     float(bitfieldExtract(grid, 14, 2))
+                 );
+                 vec4 sum_v = fma(b_val_0, fbits0 + delta_v, vec4(0.0));
+                 sum_v = fma(b_val_1, fbits1 + delta_v, sum_v);
+                 FLOAT_TYPE sum = dot(sum_v, vec4(1.0));
+                 temp[j][n] = fma(dl, sum, temp[j][n]);
+                 ibi += num_blocks_per_row;
+             }
+         }
+     }
+ }
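Note: to make the rewritten IQ1_S kernel above easier to follow, here is a rough Python sketch of the per-block math it performs. The names mirror the shader (grid entry, b0/b4 activations, dl, delta) and the example numbers are made up, not taken from any model.

def bitfield_extract(value: int, offset: int, bits: int) -> int:
    return (value >> offset) & ((1 << bits) - 1)

def iq1s_partial_dot(grid_entry: int, b0: list, b4: list, dl: float, delta: float) -> float:
    # each 16-bit grid entry packs eight 2-bit values; the first four pair with b0,
    # the last four with b4, and every unpacked value is offset by the +/- delta term
    fbits0 = [bitfield_extract(grid_entry, 2 * k, 2) for k in range(4)]
    fbits1 = [bitfield_extract(grid_entry, 8 + 2 * k, 2) for k in range(4)]
    s = sum(b * (f + delta) for b, f in zip(b0, fbits0))
    s += sum(b * (f + delta) for b, f in zip(b4, fbits1))
    return dl * s  # accumulated into temp[j][n] by the shader

# made-up example values
print(iq1s_partial_dot(0b0110_1001_1100_0011, [0.5, 1.0, -0.25, 2.0], [1.5, 0.0, 0.75, -1.0], 0.02, 0.125))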

View File

@ -244,17 +244,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
    const uint iqs = idx % 128;                 // 0..127
    const uint n = iqs / 64;                    // 0,1
-   const uint b = (iqs % 64) / 32;             // 0,1
+   const uint b = ((iqs % 64) / 32) * 4;       // 0,4
    const uint is_b = (iqs % 16) / 8;           // 0,1
    const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
    const uint is = 8 * n + qhshift + is_b;     // 0..15
-   const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
-   const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+   const uint qsi = n * 32 + (iqs % 32);       // 0..63
+   const uint qhi = n * 16 + (iqs % 16);       // 0..31

    const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);

-   buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32),
-                                    dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+   const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F;
+   const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303;
+   const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale;
+   buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y);
#elif defined(DATA_A_IQ1_S)
    const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
    const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
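Note: a small Python sketch of the packed Q6_K unpack used above, which decodes two quantized values per 16-bit load instead of one byte at a time; the input words below are made-up examples.

def unpack_q6k_pair(ql_u16: int, qh_u16: int, b: int, qhshift: int, dscale: float):
    ql = (ql_u16 >> b) & 0x0F0F          # low nibble of each of the two ql bytes
    qh = (qh_u16 >> qhshift) & 0x0303    # two high bits for each of the two values
    packed = ql | (qh << 4)              # per byte: a 6-bit value in 0..63
    lo, hi = packed & 0xFF, (packed >> 8) & 0xFF
    return ((lo - 32) * dscale, (hi - 32) * dscale)

print(unpack_q6k_pair(0xA7C3, 0x0102, 4, 2, 0.05))  # illustrative numbers only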

View File

@ -0,0 +1,62 @@
#version 450
#include "soft_max_large_common.glsl"
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.y;
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
const uint32_t i01 = rowx % p.ne01;
uint rowy_start = 0;
if (p.KY > 0) {
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
}
if (rowx >= p.nrows_x) {
return;
}
float slope = get_slope(rowx);
// Find max
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
const uint col = col0 + tid;
FLOAT_TYPE a = FLOAT_TYPE(0);
if (col < p.KX) {
a = data_a[rowx * p.KX + col];
}
FLOAT_TYPE b = FLOAT_TYPE(0);
if (p.KY > 0 && col < p.KX) {
b = data_b[rowy_start + col];
}
FLOAT_TYPE v = a * p.scale + slope * b;
if (col < p.KX) {
max_val = max(max_val, v);
}
}
// reduce across the workgroup
vals[tid] = max_val;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] = max(vals[tid], vals[tid + s]);
}
barrier();
}
if (tid == 0) {
max_val = vals[0];
data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = max_val;
}
}

View File

@ -0,0 +1,79 @@
#version 450
#include "soft_max_large_common.glsl"
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.y;
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
const uint32_t i01 = rowx % p.ne01;
uint rowy_start = 0;
if (p.KY > 0) {
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
}
if (rowx >= p.nrows_x) {
return;
}
float slope = get_slope(rowx);
// Find max
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
[[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
if (i + tid < gl_NumWorkGroups.x) {
max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
}
}
// reduce across the workgroup
vals[tid] = max_val;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] = max(vals[tid], vals[tid + s]);
}
barrier();
}
max_val = vals[0];
barrier();
FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
// Compute sum{exp(x - max)}
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
const uint col = col0 + tid;
if (col >= p.KX) {
break;
}
// compute exp(a*scale+b*slope), add it to sum
const uint i = rowx * p.KX + col;
FLOAT_TYPE val;
val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val);
sum += val;
data_d[i] = D_TYPE(val);
}
// reduce across the workgroup
vals[tid] = sum;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] += vals[tid + s];
}
barrier();
}
if (tid == 0) {
sum = vals[0];
data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum;
}
}

View File

@ -0,0 +1,65 @@
#version 450
#include "soft_max_large_common.glsl"
shared FLOAT_TYPE sumsh[BLOCK_SIZE];
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.y;
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
const uint32_t i01 = rowx % p.ne01;
uint rowy_start = 0;
if (p.KY > 0) {
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
}
if (rowx >= p.nrows_x) {
return;
}
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
[[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
if (i + tid < gl_NumWorkGroups.x) {
max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
sum += data_s[rowx * gl_NumWorkGroups.x + i + tid];
}
}
// reduce across the workgroup
vals[tid] = max_val;
sumsh[tid] = sum;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] = max(vals[tid], vals[tid + s]);
sumsh[tid] += sumsh[tid + s];
}
barrier();
}
max_val = vals[0];
sum = sumsh[0];
if (p.has_sinks != 0) {
sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val));
}
FLOAT_TYPE rcpdivisor = 1.0/sum;
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
const uint col = col0 + tid;
if (col >= p.KX) {
continue;
}
data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor);
}
}

View File

@ -0,0 +1,53 @@
#extension GL_EXT_control_flow_attributes : enable
layout (push_constant) uniform parameter
{
uint KX;
uint KY;
uint ne00;
uint ne01;
uint ne02;
uint ne12;
uint ne13;
uint nb11;
uint nb12;
uint nb13;
float scale;
float max_bias;
float m0;
float m1;
uint n_head_log2;
uint nrows_x;
uint has_sinks;
} p;
#include "types.glsl"
layout(constant_id = 0) const uint BLOCK_SIZE = 128;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(constant_id = 1) const uint num_iters = 4;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
layout (binding = 2) readonly buffer Z {float data_c[];};
layout (binding = 3) buffer D {D_TYPE data_d[];};
layout (binding = 4) buffer M {float data_m[];};
layout (binding = 5) buffer S {float data_s[];};
shared FLOAT_TYPE vals[BLOCK_SIZE];
float get_slope(uint rowx) {
float slope = 1.0f;
// ALiBi
if (p.max_bias > 0.0f) {
const uint h = (rowx / p.ne01) % p.ne02; // head index
const float base = h < p.n_head_log2 ? p.m0 : p.m1;
const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1;
slope = pow(base, exp);
}
return slope;
}
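Note: taken together, the three soft_max_large* shaders implement a split softmax for rows too long for a single workgroup: pass 1 writes per-workgroup partial maxima to data_m, pass 2 combines them, writes exp(x - max) to data_d and the per-workgroup partial sums to data_s, and pass 3 folds in the optional sink term and normalizes. A rough NumPy sketch of the same idea, ignoring the ALiBi slope and mask inputs:

import numpy as np

def softmax_large(x, chunk=4, sink=None):
    chunks = [x[i:i + chunk] for i in range(0, len(x), chunk)]
    part_max = np.array([c.max() for c in chunks])                     # pass 1: data_m
    row_max = max(part_max.max(), sink if sink is not None else -np.inf)
    y = np.concatenate([np.exp(c - row_max) for c in chunks])          # pass 2: data_d
    part_sum = np.array([np.exp(c - row_max).sum() for c in chunks])   # pass 2: data_s
    total = part_sum.sum() + (np.exp(sink - row_max) if sink is not None else 0.0)
    return y / total                                                   # pass 3: normalize

x = np.random.randn(10).astype(np.float32)
print(np.allclose(softmax_large(x), np.exp(x - x.max()) / np.exp(x - x.max()).sum()))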

View File

@ -10,6 +10,7 @@
layout (push_constant) uniform parameter
{
    uint n_rows;
+   uint n_experts_push;
    uint n_expert_used;
    float clamp_min;
    float clamp_max;

@ -18,11 +19,16 @@ layout (push_constant) uniform parameter

layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
layout(constant_id = 0) const uint WARP_SIZE = 32;
- layout(constant_id = 1) const uint n_experts = 512;
+ layout(constant_id = 1) const uint n_experts_spec = 512;
layout(constant_id = 2) const bool with_norm = true;
layout(constant_id = 3) const bool late_softmax = false;
+ layout(constant_id = 4) const bool nexperts_use_push = false;

- const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
+ uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
+
+ #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+ const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);

layout (binding = 0, std430) readonly buffer Logits {float logits[];};
layout (binding = 1, std430) writeonly buffer Weights {float weights[];};

@ -94,7 +100,7 @@ void main() {
    }

    if (!late_softmax) {
-       softmax_warp_inplace(wt, n_experts, lane, false);
+       softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
    }

    // at this point, each thread holds a portion of softmax,
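Note: the spec constant keeps the per-lane array size a compile-time constant, while the push constant carries the runtime expert count; a quick illustration of the CEIL_DIV sizing above (numbers are illustrative, not from any model):

def ceil_div(a, b):
    return (a + b - 1) // b

WARP_SIZE = 32
for n_experts_spec in (8, 32, 129, 512):
    print(n_experts_spec, ceil_div(n_experts_spec, WARP_SIZE))  # experts_per_thread
# e.g. 129 experts fit when each of the 32 lanes handles ceil(129/32) = 5 slots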

View File

@ -704,13 +704,15 @@ void process_shaders() {
        shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? "get_rows.comp" : "get_rows_quant.comp";

        if (tname == "f16") {
-           string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
+           string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
        } else {
-           string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
+           string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
        }
-       string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
+       string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
    }
+   string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}});

    string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});

@ -854,6 +856,8 @@ void process_shaders() {

    string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
    string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+   string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+   string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
    string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});

@ -899,6 +903,13 @@ void process_shaders() {

    string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
    string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+   string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+   string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+   string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+   string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+   string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+   string_to_spv("soft_max_large3_f32_f16", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));

    string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
    string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
    string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});

View File

@ -1,5 +1,5 @@
{
-     "extraPaths": ["gguf-py"],
+     "extraPaths": ["gguf-py", "examples/model-conversion/scripts"],
      "pythonVersion": "3.9",
      "pythonPlatform": "All",
      "reportUnusedImport": "warning",

scripts/compare-logprobs.py (new file, 281 lines)
View File

@ -0,0 +1,281 @@
import argparse
import requests
import json
from pathlib import Path
import logging
logger = logging.getLogger("compare-logprobs")
logging.basicConfig(level=logging.INFO)
DESCRIPTION = """
Compare logits between llama.cpp and another inference engine using OpenAI-compatible server endpoints.
Unlike compare-logits.py, it allows dumping logits from a hosted API endpoint. Useful when it's not possible to run both models locally.
Example usage:
Step 1: Dump logits from two different servers
python scripts/compare-logprobs.py dump logits_llama.log http://localhost:8080/v1/completions
python scripts/compare-logprobs.py dump logits_other.log http://other-engine:8000/v1/completions
(optionally, you can add --api-key <key> if the endpoint requires authentication)
Step 2: Compare the dumped logits
python scripts/compare-logprobs.py compare logits_llama.log logits_other.log report.md
"""
def generate_input_prompt(length: int) -> list[str]:
CORPUS = """
You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls.
### Tool Call Format:
When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text.
You can make multiple calls in one go by placing them one after another.
"""
words = [w.strip() for w in CORPUS.strip().split(" ")]
words = [w for w in words if len(w) > 0] # filter out empty strings
while len(words) < length:
words += words
return words[:length]
def dump_logits(
endpoint: str,
output_path: Path,
input_words: list[str],
pattern: list[tuple[bool, int]],
api_key=None,
):
logger.info(f"Dumping logits to {output_path} from endpoint {endpoint}...")
words = input_words
curr_text = ""
n_total = sum(n for get, n in pattern if get)
n_done = 0
i_cur = 0
i_total = len(words)
with output_path.open("w") as f:
for get, n in pattern:
if not get:
# skip n words
for i in range(n):
curr_text += words.pop(0) + " "
i_cur += 1
continue
# get n words
for i in range(n):
curr_text += words.pop(0) + " "
payload = {
"prompt": curr_text.strip(),
"temperature": 0.0,
"top_k": 1,
"max_tokens": 1,
"logprobs": 1,
"stream": False,
}
response = requests.post(
endpoint,
json=payload,
headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
)
response.raise_for_status()
data = response.json()
data["__index"] = i_cur # add index for easier debugging later
data = json.dumps(data)
f.write(f"{data}\n")
n_done += 1
i_cur += 1
logger.info(
f"\n\n{data}\n\n[Step: {n_done}/{n_total} | Word: {i_cur}/{i_total}]"
)
logger.info(f"Logits dumped to {output_path}")
def get_token_logprobs(data: dict):
logprobs = data["choices"][0]["logprobs"]
if "content" in logprobs:
# llama.cpp case
top = logprobs["content"][0]["top_logprobs"][0]
return top["token"], top["logprob"]
else:
# vllm case
tokens = logprobs["tokens"]
token_logprobs = logprobs["token_logprobs"]
return tokens[0], token_logprobs[0]
def clean_text(text: str) -> str:
return (
"'"
+ text.replace("\n", "\\n")
.replace("\t", "\\t")
.replace("\r", "\\r")
.replace("|", "\\|")
+ "'"
)
def compare_logits(input1: Path, input2: Path, output_path: Path):
with input1.open("r") as f1, input2.open("r") as f2, output_path.open("w") as fout:
lines1 = f1.readlines()
lines2 = f2.readlines()
tab_header = [
"idx",
input1.name,
"logprob_1",
input2.name,
"logprob_2",
"diff (abs)",
]
tab_entries = []
tab_max_widths = [len(h) for h in tab_header]
assert len(lines1) == len(
lines2
), "Input files must have the same number of lines."
fout.write("# Logits Comparison Report\n\n")
for i, (line1, line2) in enumerate(zip(lines1, lines2)):
if not line1.strip() or not line2.strip():
continue # skip empty lines
data1 = json.loads(line1)
data2 = json.loads(line2)
idx1 = data1.get("__index", -1)
idx2 = data2.get("__index", -1)
if idx1 != idx2:
logger.warning(
f"Warning: Mismatched indices at line {i}: {idx1} vs {idx2}"
)
token1, logprob1 = get_token_logprobs(data1)
token2, logprob2 = get_token_logprobs(data2)
token1 = clean_text(token1)
token2 = clean_text(token2)
abs_diff = abs(logprob1 - logprob2)
tab_entries.append(
(
str(idx1 + 1),
token1,
f"{logprob1:.4f}",
token2,
f"{logprob2:.4f}",
f"{(abs_diff):.4f}",
)
)
for i in range(len(tab_entries)):
for j in range(len(tab_header)):
tab_max_widths[j] = max(tab_max_widths[j], len(tab_entries[i][j]))
output = ""
for j in range(len(tab_header)):
output += f"| {tab_header[j]:<{tab_max_widths[j]}} "
output += "|\n"
for j in range(len(tab_header)):
output += f"|{'-' * (tab_max_widths[j] + 2)}"
output += "|\n"
for entry in tab_entries:
for j in range(len(tab_header)):
output += f"| {entry[j]:<{tab_max_widths[j]}} "
output += "|\n"
logger.info("\n" + output)
fout.write(output)
logger.info(f"Report written to {output_path}")
def parse_pattern(pattern: str) -> list[tuple[bool, int]]:
parts = pattern.split(",")
result = []
for i, part in enumerate(parts):
n = int(part)
if i % 2 == 0:
result.append((True, n)) # get n words
else:
result.append((False, n)) # skip n words
return result
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter
)
subparsers = parser.add_subparsers(
dest="verb", required=True, help="action to perform"
)
# dump subcommand
parser_dump = subparsers.add_parser("dump", help="dump logits from an endpoint")
parser_dump.add_argument(
"output", type=Path, help="output path for dumped logits (.log)"
)
parser_dump.add_argument(
"endpoint", type=str, help="OAI-compat /completions endpoint"
)
parser_dump.add_argument(
"--api-key",
type=str,
default=None,
help="API key for authentication (if required)",
)
parser_dump.add_argument(
"--file",
type=Path,
default=None,
help="File containing prompt to use instead of the default",
)
parser_dump.add_argument(
"--pattern",
type=str,
default="10,1000,10,4000,10",
help="Pattern n_get,n_skip,... where n_get is number of words to get and n_skip is number of words to skip (num of words, NOT num of tokens)",
)
# compare subcommand
parser_compare = subparsers.add_parser(
"compare", help="compare two dumped logits files"
)
parser_compare.add_argument("input1", type=Path, help="first input file (.log)")
parser_compare.add_argument("input2", type=Path, help="second input file (.log)")
parser_compare.add_argument(
"output", type=Path, help="output path for comparison report (.md)"
)
try:
return parser.parse_args()
except Exception as e:
parser.print_help()
raise e
def main():
args = parse_args()
if args.verb == "dump":
pattern = parse_pattern(args.pattern)
input_length = sum(n for _, n in pattern)
input_words = generate_input_prompt(input_length)
if args.file is not None:
with args.file.open("r") as f:
input_words = f.read().strip().split(" ")
if len(input_words) < input_length:
    raise ValueError(
        f"Input file has only {len(input_words)} words, but the pattern requires at least {input_length} words."
    )
input_length = len(input_words)
logger.info(f"Using {input_length} words")
dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key)
elif args.verb == "compare":
compare_logits(args.input1, args.input2, args.output)
else:
raise ValueError(f"Unknown verb: {args.verb}")
if __name__ == "__main__":
main()
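Note: for reference, the default --pattern above alternates get/skip word counts, starting with a "get"; a quick check using the parse_pattern helper defined in this script:

# "10,1000,10,4000,10" means: dump logprobs for 10 words, skip 1000, dump 10, skip 4000, dump 10
print(parse_pattern("10,1000,10,4000,10"))
# [(True, 10), (False, 1000), (True, 10), (False, 4000), (True, 10)]  -> 5030 words total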

View File

@ -1 +1 @@
- 55bc9320a4aae82af18e23eefd5de319a755d7b9
+ 130bc125a88bb57664b88932c48c38a1cb316fac

View File

@ -9,6 +9,7 @@
#include "llama-model.h" #include "llama-model.h"
#include <cinttypes> #include <cinttypes>
#include <cmath>
#include <cstring> #include <cstring>
#include <limits> #include <limits>
#include <stdexcept> #include <stdexcept>
@ -91,6 +92,43 @@ llama_context::llama_context(
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
} }
if (cparams.yarn_ext_factor != 0) {
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
const float factor = 1.0f / cparams.rope_freq_scale;
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
if (hparams.rope_yarn_log_mul != 0.0f) {
// note: here we assume `mscale == 1.0f`
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
float mscale = 1.0f;
const float mscale_all_dims = hparams.rope_yarn_log_mul;
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// special-case DEEPSEEK v2:
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
mscale = mscale_all_dims;
}
cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
__func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
} else {
cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
}
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
//
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
// https://github.com/ggml-org/llama.cpp/pull/17945
cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
}
cparams.yarn_attn_factor *= hparams.rope_attn_factor; cparams.yarn_attn_factor *= hparams.rope_attn_factor;
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@ -1728,6 +1766,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif #endif
synchronize();
buf_output = nullptr; buf_output = nullptr;
logits = nullptr; logits = nullptr;
embd = nullptr; embd = nullptr;
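Note: in formula form, the adjustment introduced above amounts to the following, where $s$ is the inverse RoPE frequency scale and mscale is 1 except in the DeepSeek-V2 special case (in which case the ratio collapses to 1):

$$
m(s,\mu) = \begin{cases} 1 & s \le 1 \\ 1 + 0.1\,\mu\,\ln s & s > 1 \end{cases},
\qquad
\mathrm{yarn\_attn\_factor} = \frac{m(s,\,1)}{m(s,\,\mu_{\mathrm{all\_dims}})} \cdot \frac{1}{1 + 0.1\,\ln s},
\qquad s = \frac{1}{\mathrm{rope\_freq\_scale}}
$$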

View File

@ -1,6 +1,7 @@
#include "llama-hparams.h" #include "llama-hparams.h"
#include "ggml.h" #include "ggml.h"
#include <cassert> #include <cassert>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {

View File

@ -107,6 +107,7 @@ struct llama_hparams {
    float rope_freq_base_train_swa;
    float rope_freq_scale_train;
    float rope_freq_scale_train_swa;

    uint32_t n_ctx_orig_yarn;
    float rope_yarn_log_mul = 0.0f;

@ -270,4 +271,3 @@ struct llama_hparams {
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
        float freq_scale) const {
    const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

    const auto & yarn_ext_factor = cparams.yarn_ext_factor;
    const auto & yarn_beta_fast = cparams.yarn_beta_fast;
    const auto & yarn_beta_slow = cparams.yarn_beta_slow;
+   const auto & yarn_attn_factor = cparams.yarn_attn_factor;

    const auto & n_rot = hparams.n_rot;
    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE

@ -1382,12 +1383,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
        ? LLAMA_ROPE_TYPE_NEOX
        : hparams.rope_type;

-   // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
-   // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-   const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
-       ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
-       : cparams.yarn_attn_factor;
-
    ggml_tensor * tmp;

    if (ggml_is_quantized(cur->type)) {

View File

@ -1635,7 +1635,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // that have no expert_gating_func model parameter set
                hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
            }
-           ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+           if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+               // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+               // cancel the factor from the convert script
+               hparams.rope_yarn_log_mul /= 0.1f;
+           }

            // (optional) temperature tuning - used by mistral-large
            ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

@ -2267,9 +2272,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
            ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
-           ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+           ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);

            // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
            if (hparams.f_attn_temp_scale != 0.0f) {

@ -2279,18 +2284,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                }
            }

-           // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-           //       but may need further verification with other values
-           if (hparams.rope_yarn_log_mul != 0.0f) {
-               float factor = 1.0f / hparams.rope_freq_scale_train;
-               float mscale = 1.0f;
-               float mscale_all_dims = hparams.rope_yarn_log_mul;
-               static auto get_mscale = [](float scale, float mscale) {
-                   return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-               };
-               hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-           }
-
            switch (hparams.n_layer) {
                case 26: type = LLM_TYPE_3B; break;
                case 34: type = LLM_TYPE_8B; break;

@ -6806,6 +6799,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+       LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");

        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {

@ -6869,7 +6863,6 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-       LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {

View File

@ -1,7 +1,5 @@
#include "models.h" #include "models.h"
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) { llm_graph_context(params) {
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
GGML_ASSERT(ext_factor >= 0.0f);
const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
// use the original attn_factor to pre-scale the kq_scale
const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
ggml_tensor * cur; ggml_tensor * cur;
ggml_tensor * inpL; ggml_tensor * inpL;
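Note: spelled out, the pre-scaling above first recovers the original attention factor and then folds it into the KQ scale (same notation as before, $s = 1/\mathrm{freq\_scale}$, $d_k = $ n_embd_head_k):

$$
\mathrm{attn\_factor_{org}} = \mathrm{attn\_factor}\,(1 + 0.1\,\ln s), \qquad
m = \mathrm{attn\_factor_{org}}\,\bigl(1 + 0.1\,\mu_{\mathrm{log\_mul}}\,\ln s\bigr), \qquad
\mathrm{kq\_scale} = \frac{m^2}{\sqrt{d_k}}
$$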

View File

@ -20,20 +20,20 @@ int main(void) {
    std::unordered_set<std::string> seen_env_vars;
    for (const auto & opt : ctx_arg.options) {
        // check for args duplications
-       for (const auto & arg : opt.args) {
+       for (const auto & arg : opt.get_args()) {
            if (seen_args.find(arg) == seen_args.end()) {
                seen_args.insert(arg);
            } else {
-               fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
+               fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str());
                exit(1);
            }
        }
        // check for env var duplications
-       if (opt.env) {
-           if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
-               seen_env_vars.insert(opt.env);
+       for (const auto & env : opt.get_env()) {
+           if (seen_env_vars.find(env) == seen_env_vars.end()) {
+               seen_env_vars.insert(env);
            } else {
-               fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
+               fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str());
                exit(1);
            }
        }

@ -72,6 +72,10 @@ int main(void) {
    argv = {"binary_name", "--draft", "123"};
    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING));

+   // negated arg
+   argv = {"binary_name", "--no-mmap"};
+   assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
    printf("test-arg-parser: test valid usage\n\n");

@ -115,6 +119,14 @@ int main(void) {
    assert(params.model.path == "blah.gguf");
    assert(params.cpuparams.n_threads == 1010);

+   printf("test-arg-parser: test negated environment variables\n\n");
+
+   setenv("LLAMA_ARG_MMAP", "0", true);
+   setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
+   argv = {"binary_name"};
+   assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+   assert(params.use_mmap == false);
+   assert(params.no_perf == true);
+
    printf("test-arg-parser: test environment variables being overwritten\n\n");

View File

@ -7655,6 +7655,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f));
+   test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f));
+   test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f));

    for (float max_bias : {0.0f, 8.0f}) {
        for (float scale : {1.0f, 0.1f}) {
            for (int64_t ne0 : {16, 1024}) {

@ -7864,9 +7867,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));

    for (bool v : {false, true}) {
        for (bool circular : {false, true}) {

@ -7959,8 +7977,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    for (bool with_norm : {false, true}) {
        test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm));
+       test_cases.emplace_back(new test_topk_moe({31, 22, 1, 1}, 8, with_norm));
        test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm));
+       test_cases.emplace_back(new test_topk_moe({40, 22, 1, 1}, 8, with_norm));
+       test_cases.emplace_back(new test_topk_moe({71, 22, 1, 1}, 8, with_norm));
        test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm));
+       test_cases.emplace_back(new test_topk_moe({129, 1, 1, 1}, 128, with_norm));
    }

    test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true));

@ -8067,12 +8089,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));

-   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
    // qwen3next with CHUNK_SIZE 64
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
    // qwen3next with CHUNK_SIZE 128
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+   test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));

    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));

View File

@ -6,11 +6,25 @@ add_library(mtmd
    mtmd.cpp
    mtmd-audio.cpp
    mtmd.h
+   mtmd-helper.cpp
+   mtmd-helper.h
    clip.cpp
    clip.h
    clip-impl.h
-   mtmd-helper.cpp
-   mtmd-helper.h
+   clip-model.h
+   clip-graph.h
+   models/models.h
+   models/cogvlm.cpp
+   models/internvl.cpp
+   models/kimivl.cpp
+   models/llama4.cpp
+   models/llava.cpp
+   models/minicpmv.cpp
+   models/pixtral.cpp
+   models/qwen2vl.cpp
+   models/qwen3vl.cpp
+   models/siglip.cpp
+   models/whisper-enc.cpp
    )

set_target_properties(mtmd PROPERTIES

@ -53,6 +67,15 @@ if (TARGET BUILD_INFO)
    add_dependencies(mtmd-helper BUILD_INFO)
endif()

+ # if mtmd is linked against common, we throw an error
+ if (TARGET mtmd)
+     get_target_property(libs mtmd LINK_LIBRARIES)
+     if (libs AND "common" IN_LIST libs)
+         message(FATAL_ERROR "mtmd is designed to be a public library.\n"
+                             "It must not link against common")
+     endif()
+ endif()
+
add_executable(llama-llava-cli deprecation-warning.cpp)
add_executable(llama-gemma3-cli deprecation-warning.cpp)
add_executable(llama-minicpmv-cli deprecation-warning.cpp)

tools/mtmd/clip-graph.h (new file, 115 lines)
View File

@ -0,0 +1,115 @@
#pragma once
#include "ggml.h"
#include "ggml-cpp.h"
#include "clip.h"
#include "clip-impl.h"
#include "clip-model.h"
#include <vector>
#include <functional>
struct clip_graph {
const clip_model & model;
const clip_hparams & hparams;
projector_type proj_type;
// we only support single image per batch
const clip_image_f32 & img;
const int patch_size;
const int n_patches_x;
const int n_patches_y;
const int n_patches;
const int n_embd;
const int n_head;
const int d_head;
const int n_layer;
const int n_mmproj_embd;
const float eps;
const float kq_scale;
const clip_flash_attn_type flash_attn_type;
// for debugging
const bool debug_graph;
std::vector<ggml_tensor *> & debug_print_tensors;
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;
clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
virtual ~clip_graph() = default;
virtual ggml_cgraph * build() = 0;
//
// utility functions
//
void cb(ggml_tensor * cur0, const char * name, int il) const;
// siglip2 naflex
ggml_tensor * resize_position_embeddings();
// build vision transformer (ViT) cgraph
// this function should cover most of the models
// if your model has specific features, you should probably duplicate this function
ggml_tensor * build_vit(
ggml_tensor * inp,
int64_t n_pos,
norm_type norm_t,
ffn_op_type ffn_t,
ggml_tensor * learned_pos_embd,
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
// build the input after conv2d (inp_raw --> patches)
// returns tensor with shape [n_embd, n_patches]
ggml_tensor * build_inp();
ggml_tensor * build_inp_raw(int channels = 3);
ggml_tensor * build_norm(
ggml_tensor * cur,
ggml_tensor * mw,
ggml_tensor * mb,
norm_type type,
float norm_eps,
int il) const;
ggml_tensor * build_ffn(
ggml_tensor * cur,
ggml_tensor * up,
ggml_tensor * up_b,
ggml_tensor * gate,
ggml_tensor * gate_b,
ggml_tensor * down,
ggml_tensor * down_b,
ffn_op_type type_op,
int il) const;
ggml_tensor * build_attn(
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_mask,
float kq_scale,
int il) const;
// implementation of the 2D RoPE without adding a new op in ggml
// this is not efficient (use double the memory), but works on all backends
// TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but rope inplace does not work well with non-contiguous tensors; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
ggml_tensor * build_rope_2d(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * pos_a, // first half
ggml_tensor * pos_b, // second half
const float freq_base,
const bool interleave_freq
);
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
// support dynamic resolution
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
};

View File

@ -1,3 +1,5 @@
+ #pragma once
+
#include "ggml.h"
#include "gguf.h"
#include "clip.h"

@ -13,6 +15,8 @@
// Internal header for clip.cpp

+ #define MTMD_INTERNAL_HEADER
+
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"

@ -132,6 +136,10 @@
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

+ // forward declaration
+ // TODO: improve this later
+ struct clip_ctx;
+
enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,

tools/mtmd/clip-model.h (new file, 279 lines)
View File

@ -0,0 +1,279 @@
#pragma once
#include "ggml.h"
#include "clip.h"
#include "clip-impl.h"
#include <vector>
#include <unordered_set>
#include <cstdint>
#include <cmath>
enum ffn_op_type {
FFN_GELU,
FFN_GELU_ERF,
FFN_SILU,
FFN_GELU_QUICK,
};
enum norm_type {
NORM_TYPE_NORMAL,
NORM_TYPE_RMS,
};
enum patch_merge_type {
PATCH_MERGE_FLAT,
PATCH_MERGE_SPATIAL_UNPAD,
};
struct clip_hparams {
int32_t image_size = 0;
int32_t patch_size = 0;
int32_t n_embd = 0;
int32_t n_ff = 0;
int32_t projection_dim = 0;
int32_t n_head = 0;
int32_t n_layer = 0;
// idefics3
int32_t image_longest_edge = 0;
int32_t image_min_pixels = -1;
int32_t image_max_pixels = -1;
int32_t n_merge = 0; // number of patch merges **per-side**
float image_mean[3];
float image_std[3];
// for models using dynamic image size, we need a smaller image size to warm up
// otherwise, the user will get OOM every time they load the model
int32_t warmup_image_size = 0;
int32_t warmup_audio_size = 3000;
ffn_op_type ffn_op = FFN_GELU;
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
float eps = 1e-6;
float rope_theta = 0.0;
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
int32_t image_crop_resolution;
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
// audio
int32_t n_mel_bins = 0; // whisper preprocessor
int32_t proj_stack_factor = 0; // ultravox
// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
// custom value provided by user, can be undefined if not set
int32_t custom_image_min_tokens = -1;
int32_t custom_image_max_tokens = -1;
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
const int cur_merge = n_merge == 0 ? 1 : n_merge;
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
}
void set_warmup_n_tokens(int n_tokens) {
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
const int cur_merge = n_merge == 0 ? 1 : n_merge;
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
// TODO: support warmup size for custom token numbers
}
};
struct clip_layer {
// attention
ggml_tensor * k_w = nullptr;
ggml_tensor * k_b = nullptr;
ggml_tensor * q_w = nullptr;
ggml_tensor * q_b = nullptr;
ggml_tensor * v_w = nullptr;
ggml_tensor * v_b = nullptr;
ggml_tensor * qkv_w = nullptr;
ggml_tensor * qkv_b = nullptr;
ggml_tensor * o_w = nullptr;
ggml_tensor * o_b = nullptr;
ggml_tensor * k_norm = nullptr;
ggml_tensor * q_norm = nullptr;
// layernorm 1
ggml_tensor * ln_1_w = nullptr;
ggml_tensor * ln_1_b = nullptr;
ggml_tensor * ff_up_w = nullptr;
ggml_tensor * ff_up_b = nullptr;
ggml_tensor * ff_gate_w = nullptr;
ggml_tensor * ff_gate_b = nullptr;
ggml_tensor * ff_down_w = nullptr;
ggml_tensor * ff_down_b = nullptr;
// layernorm 2
ggml_tensor * ln_2_w = nullptr;
ggml_tensor * ln_2_b = nullptr;
// layer scale (no bias)
ggml_tensor * ls_1_w = nullptr;
ggml_tensor * ls_2_w = nullptr;
// qwen3vl deepstack merger
ggml_tensor * deepstack_norm_w = nullptr;
ggml_tensor * deepstack_norm_b = nullptr;
ggml_tensor * deepstack_fc1_w = nullptr;
ggml_tensor * deepstack_fc1_b = nullptr;
ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr;
bool has_deepstack() const {
return deepstack_fc1_w != nullptr;
}
};
struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP;
clip_hparams hparams;
// embeddings
ggml_tensor * class_embedding = nullptr;
ggml_tensor * patch_embeddings_0 = nullptr;
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
ggml_tensor * patch_bias = nullptr;
ggml_tensor * position_embeddings = nullptr;
ggml_tensor * pre_ln_w = nullptr;
ggml_tensor * pre_ln_b = nullptr;
std::vector<clip_layer> layers;
int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
ggml_tensor * post_ln_w;
ggml_tensor * post_ln_b;
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
ggml_tensor * mm_fc_w;
ggml_tensor * mm_fc_b;
// LLaVA projection
ggml_tensor * mm_input_norm_w = nullptr;
ggml_tensor * mm_input_norm_b = nullptr;
ggml_tensor * mm_0_w = nullptr;
ggml_tensor * mm_0_b = nullptr;
ggml_tensor * mm_2_w = nullptr;
ggml_tensor * mm_2_b = nullptr;
ggml_tensor * image_newline = nullptr;
// Yi type models with mlp+normalization projection
ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
ggml_tensor * mm_1_b = nullptr;
ggml_tensor * mm_3_w = nullptr;
ggml_tensor * mm_3_b = nullptr;
ggml_tensor * mm_4_w = nullptr;
ggml_tensor * mm_4_b = nullptr;
// GLMV-Edge projection
ggml_tensor * mm_model_adapter_conv_w = nullptr;
ggml_tensor * mm_model_adapter_conv_b = nullptr;
// MobileVLM projection
ggml_tensor * mm_model_mlp_1_w = nullptr;
ggml_tensor * mm_model_mlp_1_b = nullptr;
ggml_tensor * mm_model_mlp_3_w = nullptr;
ggml_tensor * mm_model_mlp_3_b = nullptr;
ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
// MobileVLM_V2 projection
ggml_tensor * mm_model_mlp_0_w = nullptr;
ggml_tensor * mm_model_mlp_0_b = nullptr;
ggml_tensor * mm_model_mlp_2_w = nullptr;
ggml_tensor * mm_model_mlp_2_b = nullptr;
ggml_tensor * mm_model_peg_0_w = nullptr;
ggml_tensor * mm_model_peg_0_b = nullptr;
// MINICPMV projection
ggml_tensor * mm_model_pos_embed_k = nullptr;
ggml_tensor * mm_model_query = nullptr;
ggml_tensor * mm_model_proj = nullptr;
ggml_tensor * mm_model_kv_proj = nullptr;
ggml_tensor * mm_model_attn_q_w = nullptr;
ggml_tensor * mm_model_attn_q_b = nullptr;
ggml_tensor * mm_model_attn_k_w = nullptr;
ggml_tensor * mm_model_attn_k_b = nullptr;
ggml_tensor * mm_model_attn_v_w = nullptr;
ggml_tensor * mm_model_attn_v_b = nullptr;
ggml_tensor * mm_model_attn_o_w = nullptr;
ggml_tensor * mm_model_attn_o_b = nullptr;
ggml_tensor * mm_model_ln_q_w = nullptr;
ggml_tensor * mm_model_ln_q_b = nullptr;
ggml_tensor * mm_model_ln_kv_w = nullptr;
ggml_tensor * mm_model_ln_kv_b = nullptr;
ggml_tensor * mm_model_ln_post_w = nullptr;
ggml_tensor * mm_model_ln_post_b = nullptr;
// gemma3
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr;
// pixtral
ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr;
// ultravox / whisper encoder
ggml_tensor * conv1d_1_w = nullptr;
ggml_tensor * conv1d_1_b = nullptr;
ggml_tensor * conv1d_2_w = nullptr;
ggml_tensor * conv1d_2_b = nullptr;
ggml_tensor * mm_norm_pre_w = nullptr;
ggml_tensor * mm_norm_mid_w = nullptr;
// cogvlm
ggml_tensor * mm_post_fc_norm_w = nullptr;
ggml_tensor * mm_post_fc_norm_b = nullptr;
ggml_tensor * mm_h_to_4h_w = nullptr;
ggml_tensor * mm_gate_w = nullptr;
ggml_tensor * mm_4h_to_h_w = nullptr;
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
}
bool audio_has_stack_frames() const {
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
}
};

File diff suppressed because it is too large

View File

@ -7,6 +7,8 @@
// !!! Internal header, to be used by mtmd only !!!
#define MTMD_INTERNAL_HEADER
struct clip_ctx;
struct clip_image_size {

View File

@ -0,0 +1,98 @@
#include "models.h"
ggml_cgraph * clip_graph_cogvlm::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_pos = n_patches + 1; // +1 for [CLS]
// build input and concatenate class embedding
ggml_tensor * inp = build_inp();
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
inp = ggml_add(ctx0, inp, model.position_embeddings);
cb(inp, "inp_pos", -1);
ggml_tensor * inpL = inp;
for (int il = 0; il < n_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL;
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], n_embd * sizeof(float));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 2 * n_embd * sizeof(float));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "attn_post_norm", il);
cur = ggml_add(ctx0, cur, inpL);
inpL = cur;
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "layer_out", il);
inpL = cur;
}
// remove CLS token (like build_llama4 does)
ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
n_embd, n_patches,
ggml_row_size(inpL->type, n_embd), 0);
// Multiply with mm_model_proj
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
// Apply layernorm, weight, bias
cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
// Apply GELU
cur = ggml_gelu_inplace(ctx0, cur);
// Branch 1: multiply with mm_h_to_4h_w
ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
// Branch 2: multiply with mm_gate_w
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
// Apply silu
gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
// Apply mm_4h_to_h_w
cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
// Concatenate with boi and eoi
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
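The projector tail above is a gated MLP; a sketch of the math, assuming ggml_swiglu_split(a, b) computes silu(a) multiplied elementwise by b (the exact kernel lives in ggml):
\[
y = W_{4h \to h}\,\bigl(\operatorname{silu}(W_{gate}\,x) \odot (W_{h \to 4h}\,x)\bigr),
\qquad \text{output} = [\,e_{boi};\; y;\; e_{eoi}\,]
\]
where x is the GELU-activated, layer-normed projection of the CLS-stripped ViT output, and e_boi, e_eoi are the mm_boi / mm_eoi embeddings concatenated around the image tokens.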

View File

@ -0,0 +1,69 @@
#include "models.h"
ggml_cgraph * clip_graph_internvl::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_pos = n_patches + 1;
ggml_tensor * inp = build_inp();
// add CLS token
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
// The larger models use a different ViT, which uses RMS norm instead of layer norm
// ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
: NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
ggml_tensor * cur = build_vit(
inp, n_pos,
norm_t,
hparams.ffn_op,
model.position_embeddings,
nullptr);
// remove CLS token
cur = ggml_view_2d(ctx0, cur,
n_embd, n_patches,
ggml_row_size(cur->type, n_embd), 0);
// pixel shuffle
{
const int scale_factor = model.hparams.n_merge;
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = n_patches_y;
const int width = n_patches_x;
GGML_ASSERT(scale_factor > 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D
cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor,
cur->ne[1] * cur->ne[2]);
}
// projector (always using GELU activation)
{
// projector LayerNorm uses pytorch's default eps = 1e-5
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_3_w, model.mm_3_b,
FFN_GELU,
-1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
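The pixel-shuffle block above trades spatial resolution for channel width: every scale_factor x scale_factor group of patch embeddings becomes one token whose feature vector is scale_factor^2 times wider. A minimal standalone sketch of that regrouping (plain C++, not the ggml graph; the exact feature ordering inside a group depends on the permutes above):

#include <cstdio>
#include <vector>

int main() {
    const int s = 2;              // scale factor (hparams.n_merge)
    const int W = 4, H = 4;       // patch grid
    const int n_embd = 3;         // toy embedding size
    // input: one embedding per patch, laid out row-major over (y, x)
    std::vector<float> in(W * H * n_embd);
    for (int i = 0; i < (int) in.size(); i++) in[i] = (float) i;
    // output: (W/s)*(H/s) tokens, each with n_embd*s*s features
    std::vector<float> out((W / s) * (H / s) * n_embd * s * s);
    for (int oy = 0; oy < H / s; oy++) {
        for (int ox = 0; ox < W / s; ox++) {
            float * dst = &out[(oy * (W / s) + ox) * n_embd * s * s];
            for (int dy = 0; dy < s; dy++) {
                for (int dx = 0; dx < s; dx++) {
                    const float * src = &in[((oy * s + dy) * W + (ox * s + dx)) * n_embd];
                    for (int c = 0; c < n_embd; c++) {
                        *dst++ = src[c]; // concatenate the s*s neighbors into one token
                    }
                }
            }
        }
    }
    printf("tokens: %d -> %d, features: %d -> %d\n",
           W * H, (W / s) * (H / s), n_embd, n_embd * s * s);
    return 0;
}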

View File

@ -0,0 +1,63 @@
#include "models.h"
ggml_cgraph * clip_graph_kimivl::build() {
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
ggml_tensor * learned_pos_embd = resize_position_embeddings();
// build ViT with 2D position embeddings
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
// first half is X axis and second half is Y axis
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
};
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
add_pos);
cb(cur, "vit_out", -1);
{
// patch_merger
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection norm
int proj_inp_dim = cur->ne[0];
cur = ggml_view_2d(ctx0, cur,
n_embd, cur->ne[1] * scale_factor * scale_factor,
ggml_row_size(cur->type, n_embd), 0);
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = ggml_view_2d(ctx0, cur,
proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
ggml_row_size(cur->type, proj_inp_dim), 0);
cb(cur, "proj_inp_normed", -1);
// projection mlp
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
cb(cur, "proj_out", -1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
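build_rope_2d above applies rotary embeddings where the first half of each head's dimensions rotates with the X position and the second half with the Y position. A minimal standalone sketch of that idea, assuming the usual RoPE pairing of consecutive dimensions and theta scaling; the real implementation's pair layout and frequency indexing may differ:

#include <cmath>
#include <cstdio>
#include <vector>

// Rotate consecutive pairs of q by angles pos * theta^(-i/d) -- standard RoPE.
static void rope_half(std::vector<float> & q, int begin, int end, float pos, float theta) {
    const int d = end - begin; // number of dims rotated by this position component
    for (int i = 0; i + 1 < d; i += 2) {
        const float freq = std::pow(theta, -(float) i / (float) d);
        const float c = std::cos(pos * freq), s = std::sin(pos * freq);
        float & a = q[begin + i];
        float & b = q[begin + i + 1];
        const float a2 = a * c - b * s;
        const float b2 = a * s + b * c;
        a = a2; b = b2;
    }
}

int main() {
    const int d_head = 8;
    std::vector<float> q(d_head, 1.0f);
    const float pos_x = 3.0f, pos_y = 5.0f, theta = 10000.0f;
    rope_half(q, 0,          d_head / 2, pos_x, theta); // first half: X axis
    rope_half(q, d_head / 2, d_head,     pos_y, theta); // second half: Y axis
    for (float v : q) printf("%.3f ", v);
    printf("\n");
    return 0;
}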

View File

@ -0,0 +1,96 @@
#include "models.h"
ggml_cgraph * clip_graph_llama4::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_pos = n_patches + 1; // +1 for [CLS]
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
ggml_tensor * inp = build_inp_raw();
// Llama4UnfoldConvolution
{
ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
patch_size, patch_size, 3, n_embd);
inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
cb(inp, "patch_conv", -1);
}
// add CLS token
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
// build ViT with 2D position embeddings
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
// first half is X axis and second half is Y axis
// ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
// ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
};
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
model.position_embeddings,
add_pos);
// remove CLS token
cur = ggml_view_2d(ctx0, cur,
n_embd, n_patches,
ggml_row_size(cur->type, n_embd), 0);
// pixel shuffle
// based on Llama4VisionPixelShuffleMLP
// https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
{
const int scale_factor = model.hparams.n_merge;
const int bsz = 1; // batch size, always 1 for now since we don't support batching
GGML_ASSERT(scale_factor > 0);
GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
cur = ggml_reshape_4d(ctx0, cur,
n_embd * scale_factor,
n_patches_x / scale_factor,
n_patches_y,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor,
n_patches_x / scale_factor,
n_patches_y / scale_factor,
bsz);
//cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D
cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor,
n_patches / scale_factor / scale_factor);
cb(cur, "pixel_shuffle", -1);
}
// based on Llama4VisionMLP2 (always uses GELU activation, no bias)
{
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
cur = ggml_gelu(ctx0, cur);
cb(cur, "adapter_mlp", -1);
}
// Llama4MultiModalProjector
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
cb(cur, "projected", -1);
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
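The Llama4UnfoldConvolution block relies on the identity between a non-overlapping Conv2D and an unfold-plus-matmul: with patch size p and 3 input channels, ggml_im2col flattens each p x p patch into a vector x_patch in R^{3p^2}, and the embedding is then a single matrix product,
\[
e = W\,x_{patch}, \qquad W \in \mathbb{R}^{n_{embd} \times 3p^2},
\]
so the reshaped kernel handed to im2col only supplies the window geometry, while the actual projection is the matmul with the original 2D weight patch_embeddings_0.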

tools/mtmd/models/llava.cpp (new file, 374 lines)
View File

@ -0,0 +1,374 @@
#include "models.h"
// this graph is used by llava, granite and glm
// due to having embedding_stack (used by granite), we cannot reuse build_vit
ggml_cgraph * clip_graph_llava::build() {
const int batch_size = 1;
const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
// Calculate the deepest feature layer based on hparams and projector type
int max_feature_layer = n_layer;
{
// Get the index of the second to last layer; this is the default for models that have a llava projector
int il_last = hparams.n_layer - 1;
int deepest_feature_layer = -1;
if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
il_last += 1;
}
// If we set explicit vision feature layers, only go up to the deepest one
// NOTE: only used by granite-vision models for now
for (const auto & feature_layer : hparams.vision_feature_layer) {
if (feature_layer > deepest_feature_layer) {
deepest_feature_layer = feature_layer;
}
}
max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
}
ggml_tensor * inp = build_inp();
// concat class_embeddings and patch_embeddings
if (model.class_embedding) {
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
}
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
ggml_tensor * inpL = inp;
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
cb(inpL, "pre_ln", -1);
}
std::vector<ggml_tensor *> embedding_stack;
const auto & vision_feature_layer = hparams.vision_feature_layer;
// loop over layers
for (int il = 0; il < max_feature_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// If this is an embedding feature layer, save the output.
// NOTE: 0 index here refers to the input to the encoder.
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
embedding_stack.push_back(cur);
}
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "layer_inp_normed", il);
// self-attention
{
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
if (layer.q_b) {
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
}
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
if (layer.k_b) {
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
}
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
if (layer.v_b) {
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
cb(cur, "ffn_inp", il);
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "ffn_inp_normed", il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
inpL = cur;
}
// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
}
ggml_tensor * embeddings = inpL;
// process vision feature layers (used by granite)
{
// final layer is a vision feature layer
if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
embedding_stack.push_back(inpL);
}
// If feature layers are explicitly set, stack them (if we have multiple)
if (!embedding_stack.empty()) {
embeddings = embedding_stack[0];
for (size_t i = 1; i < embedding_stack.size(); i++) {
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
}
}
}
// llava projector (also used by granite)
if (hparams.has_llava_projector) {
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(patches, "patches");
ggml_set_input(patches);
// shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1]
embeddings = ggml_get_rows(ctx0, embeddings, patches);
// print_tensor_info(embeddings, "embeddings");
// llava projector
if (proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_gelu(ctx0, embeddings);
if (model.mm_2_w) {
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
}
else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
// First LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
model.mm_1_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
// Second LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
model.mm_4_b);
}
else if (proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
int n_patch = 24;
ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
mlp_1 = ggml_gelu(ctx0, mlp_1);
ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add(ctx0, mlp_3, block_1);
}
// block_2
{
// stride = 2
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
// not sure the parameters are right for global average pooling
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
}
embeddings = block_1;
}
else if (proj_type == PROJECTOR_TYPE_LDPV2)
{
int n_patch = 24;
ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
mlp_0 = ggml_gelu(ctx0, mlp_0);
ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
// mlp_2 ne = [2048, 576, 1, 1]
// // AVG Pool Layer 2*2, strides = 2
mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
// mlp_2 ne = [576, 2048, 1, 1]
mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
// mlp_2 ne [24, 24, 2048, 1]
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1]
ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
embeddings = peg_0;
}
else {
GGML_ABORT("fatal error");
}
}
// glm projector
else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
embeddings = ggml_permute(ctx0, embeddings, 1, 0, 2, 3);
embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*embeddings->ne[1], embeddings->ne[2], batch_size);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0, embeddings, 1, 0, 2, 3));
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
// GLU
{
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
embeddings = ggml_gelu_inplace(ctx0, embeddings);
ggml_tensor * x = embeddings;
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, x);
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
}
// arrangement of BOI/EOI token embeddings
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
// see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
{
embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
}
}
else {
GGML_ABORT("llava: unknown projector type");
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
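For granite-style models, the outputs of the selected vision_feature_layer entries are concatenated along the feature dimension before the projector, so the projector input width is n_embd times the number of selected layers. A small standalone sketch of that stacking (plain C++, toy sizes only, not the ggml graph):

#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 2;
    // toy "hidden states" captured at two feature layers (n_embd = 3 each)
    std::vector<std::vector<float>> layer_a = {{1, 2, 3}, {4, 5, 6}};
    std::vector<std::vector<float>> layer_b = {{7, 8, 9}, {10, 11, 12}};
    std::vector<std::vector<float>> stacked(n_tokens);
    for (int t = 0; t < n_tokens; t++) {
        // mirror embedding_stack + ggml_concat(..., 0): append features of layer_b after layer_a
        stacked[t] = layer_a[t];
        stacked[t].insert(stacked[t].end(), layer_b[t].begin(), layer_b[t].end());
    }
    printf("per-token feature size: %zu\n", stacked[0].size()); // 6 = 3 + 3
    return 0;
}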

View File

@ -0,0 +1,114 @@
#include "models.h"
ggml_cgraph * clip_graph_minicpmv::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int n_pos = n_patches;
const int n_embd_proj = n_mmproj_embd;
// position embeddings for the projector (not for ViT)
// see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
// base frequency omega
ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
ggml_set_name(omega, "omega");
ggml_set_input(omega);
// 2D input positions (using float for sinusoidal embeddings)
ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
// for selecting learned pos embd, used by ViT
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
ggml_tensor * inp = build_inp();
ggml_tensor * embeddings = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
nullptr);
// resampler projector (it is just another transformer)
ggml_tensor * q = model.mm_model_query;
ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
// norm
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
// calculate sinusoidal pos embd
ggml_tensor * pos_embed = nullptr;
{
// outer product
ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
// sin and cos
ggml_tensor * pos_embd_x = ggml_concat(
ctx0,
ggml_sin(ctx0, theta_x),
ggml_cos(ctx0, theta_x),
0 // concat on first dim
);
ggml_tensor * pos_embd_y = ggml_concat(
ctx0,
ggml_sin(ctx0, theta_y),
ggml_cos(ctx0, theta_y),
0 // concat on first dim
);
pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
}
// k = v + pos_embed
ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
// attention
{
const int d_head = 128;
int n_head = n_embd_proj/d_head;
// Use actual config value if available, otherwise fall back to hardcoded values
int num_query = hparams.minicpmv_query_num;
ggml_tensor * Q = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
model.mm_model_attn_q_b);
ggml_tensor * K = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
model.mm_model_attn_k_b);
ggml_tensor * V = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
model.mm_model_attn_v_b);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
cb(Q, "resampler_Q", -1);
cb(K, "resampler_K", -1);
cb(V, "resampler_V", -1);
float resampler_kq_scale = 1.0f/ sqrtf(float(d_head));
embeddings = build_attn(
model.mm_model_attn_o_w,
model.mm_model_attn_o_b,
Q, K, V, nullptr, resampler_kq_scale, -1);
cb(embeddings, "resampler_attn_out", -1);
}
// layernorm
embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
// projection
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
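The resampler's positional term built above is the classic 2D sinusoidal embedding: with the frequency vector omega of length n_embd_proj / 4 (filled on the host side, not shown here), each patch at grid position (x, y) gets
\[
e(x, y) = \bigl[\sin(x\,\omega);\ \cos(x\,\omega);\ \sin(y\,\omega);\ \cos(y\,\omega)\bigr] \in \mathbb{R}^{n_{embd\_proj}},
\]
which is added to the projected keys only (k = v + pos_embed), leaving the values untouched.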

View File

@ -0,0 +1,58 @@
#pragma once
#include "../clip-graph.h"
struct clip_graph_siglip : clip_graph {
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_pixtral : clip_graph {
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_qwen2vl : clip_graph {
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_internvl : clip_graph {
clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llama4 : clip_graph {
clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_kimivl : clip_graph {
clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_cogvlm : clip_graph {
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llava : clip_graph {
clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_whisper_enc : clip_graph {
clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
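Each struct above owns exactly one build() override, so graph construction reduces to picking a builder for the loaded projector type. A hypothetical dispatch helper, for illustration only (make_graph is not part of this header, and the real selection logic in clip.cpp may differ):

#include "models.h"

// illustrative only: construct the matching builder and return its graph
static ggml_cgraph * make_graph(clip_ctx * ctx, const clip_image_f32 & img, projector_type pt) {
    switch (pt) {
        case PROJECTOR_TYPE_GEMMA3:   { clip_graph_siglip      g(ctx, img); return g.build(); } // siglip graph also covers gemma3
        case PROJECTOR_TYPE_MINICPMV: { clip_graph_minicpmv    g(ctx, img); return g.build(); }
        case PROJECTOR_TYPE_ULTRAVOX: { clip_graph_whisper_enc g(ctx, img); return g.build(); }
        default:                      { clip_graph_llava       g(ctx, img); return g.build(); }
    }
}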

View File

@ -0,0 +1,86 @@
#include "models.h"
ggml_cgraph * clip_graph_pixtral::build() {
const int n_merge = hparams.n_merge;
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
};
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_RMS,
hparams.ffn_op,
nullptr, // no learned pos embd
add_pos);
// mistral small 3.1 patch merger
// ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
if (model.mm_patch_merger_w) {
GGML_ASSERT(hparams.n_merge > 0);
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
// reshape image tokens to 2D grid
cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
cur = ggml_cont(ctx0, cur);
// torch.nn.functional.unfold is just an im2col under the hood
// we just need a dummy kernel to make it work
ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
// project to n_embd
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
}
// LlavaMultiModalProjector (always using GELU activation)
{
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
}
// arrangement of the [IMG_BREAK] token
if (model.token_embd_img_break) {
// not efficient, but works
// the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
// after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
const int p_total = p_x * p_y;
const int n_embd_text = cur->ne[0];
const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
tok = ggml_add(ctx0, tok, model.token_embd_img_break);
tmp = ggml_concat(ctx0, tmp, tok, 1);
cur = ggml_view_2d(ctx0, tmp,
n_embd_text, n_tokens_output,
ggml_row_size(tmp->type, n_embd_text), 0);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
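The [IMG_BREAK] arrangement appends one break embedding per merged patch row and then drops the trailing one with the final 2D view, so the token count handed to the language model is p_total + p_y - 1. A tiny standalone check of that arithmetic (plain C++, toy grid sizes):

#include <cstdio>

int main() {
    const int n_merge     = 2;
    const int n_patches_x = 8, n_patches_y = 6;
    const int p_x = n_patches_x / n_merge;          // 4 merged patches per row
    const int p_y = n_patches_y / n_merge;          // 3 merged rows
    const int p_total = p_x * p_y;                  // 12 image tokens
    const int n_tokens_output = p_total + p_y - 1;  // one break per row except the last
    printf("%d image tokens + %d breaks = %d tokens\n", p_total, p_y - 1, n_tokens_output);
    return 0;
}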

View File

@ -0,0 +1,183 @@
#include "models.h"
ggml_cgraph * clip_graph_qwen2vl::build() {
GGML_ASSERT(model.patch_bias == nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = hparams.n_wa_pattern > 0;
const int n_wa_pattern = hparams.n_wa_pattern;
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL
? NORM_TYPE_RMS // qwen 2.5 vl
: NORM_TYPE_NORMAL; // qwen 2 vl
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
// handle window attention inputs
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
cb(cur, "ln1", il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// apply M-RoPE
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
cb(cur, "attn_out", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
cb(cur, "ffn_inp", il);
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
inpL = cur;
}
// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// multimodal projection
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
if (use_window_attn) {
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
// embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
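The multimodal projection above reshapes the encoder output to rows of n_embd * 4, i.e., every 2x2 block of patches is merged into a single token before the GELU MLP. A standalone sketch of the resulting shapes (plain C++, illustrative sizes only):

#include <cstdio>

int main() {
    const int n_embd      = 1280;                       // illustrative ViT width
    const int n_patches_x = 32, n_patches_y = 24;
    const int n_patches   = n_patches_x * n_patches_y;  // 768 patches
    const int merged_rows = n_patches / 4;              // 192 tokens enter the projector
    const int merged_cols = n_embd * 4;                 // 5120 features per token
    printf("%d patches -> %d tokens of %d features\n", n_patches, merged_rows, merged_cols);
    return 0;
}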

View File

@ -0,0 +1,191 @@
#include "models.h"
ggml_cgraph * clip_graph_qwen3vl::build() {
GGML_ASSERT(model.patch_bias != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
// add patch bias
if (model.patch_bias != nullptr) {
inp = ggml_add(ctx0, inp, model.patch_bias);
cb(inp, "patch_bias", -1);
}
// calculate absolute position embedding and apply
ggml_tensor * learned_pos_embd = resize_position_embeddings();
learned_pos_embd = ggml_cont_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
learned_pos_embd = ggml_reshape_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
learned_pos_embd = ggml_cont_3d(
ctx0, learned_pos_embd,
n_embd, n_patches_x * n_patches_y, batch_size);
inp = ggml_add(ctx0, inp, learned_pos_embd);
cb(inp, "inp_pos_emb", -1);
ggml_tensor * inpL = inp;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
// deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
ggml_tensor * deepstack_features = nullptr;
const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
// loop over layers
for (int il = 0; il < n_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
cb(cur, "ln1", il);
// self-attention
{
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// apply M-RoPE
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
cb(cur, "ffn_inp", il);
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
if (layer.has_deepstack()) {
ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
feat = build_ffn(feat,
layer.deepstack_fc1_w, layer.deepstack_fc1_b,
nullptr, nullptr,
layer.deepstack_fc2_w, layer.deepstack_fc2_b,
ffn_op_type::FFN_GELU, il);
if(!deepstack_features) {
deepstack_features = feat;
} else {
// concat along the feature dimension
deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
}
}
inpL = cur;
}
// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// multimodal projection
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
ffn_op_type::FFN_GELU, -1);
embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
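When deepstack layers are present, their projected features are concatenated onto the main projection along the feature dimension, so the per-token width grows with the number of deepstack layers. A standalone sketch of that arithmetic, assuming each deepstack FFN projects to the same width as the main projector (illustrative numbers):

#include <cstdio>

int main() {
    const int proj_dim           = 2048; // illustrative projector output width
    const int n_deepstack_layers = 3;    // layers where deepstack_fc1_w is set
    const int total_width        = proj_dim * (1 + n_deepstack_layers);
    printf("per-token width: %d (main) + %d x %d (deepstack) = %d\n",
           proj_dim, n_deepstack_layers, proj_dim, total_width);
    return 0;
}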

View File

@ -0,0 +1,81 @@
#include "models.h"
ggml_cgraph * clip_graph_siglip::build() {
ggml_tensor * inp = build_inp();
ggml_tensor * learned_pos_embd = model.position_embeddings;
if (proj_type == PROJECTOR_TYPE_LFM2) {
learned_pos_embd = resize_position_embeddings();
}
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
nullptr);
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
const int batch_size = 1;
GGML_ASSERT(n_patches_x == n_patches_y);
const int patches_per_image = n_patches_x;
const int kernel_size = hparams.n_merge;
cur = ggml_transpose(ctx0, cur);
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
// doing a pool2d to reduce the number of output tokens
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
// apply norm before projection
cur = ggml_rms_norm(ctx0, cur, eps);
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
// apply projection
cur = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
cur);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// pixel_shuffle
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
cur = ggml_mul_mat(ctx0, model.projection, cur);
} else if (proj_type == PROJECTOR_TYPE_LFM2) {
// pixel unshuffle block
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
} else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
cur = build_ffn(cur,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
hparams.ffn_op,
-1);
} else {
GGML_ABORT("SigLIP: Unsupported projector type");
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
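In the Gemma3 branch, the average pooling with kernel and stride equal to n_merge shrinks the square patch grid by n_merge on each side, so the token count drops by a factor of n_merge^2 before the RMS norm and projection. A one-line check of that arithmetic (plain C++, illustrative sizes):

#include <cstdio>

int main() {
    const int patches_per_side = 64;  // assumes a square patch grid
    const int kernel_size      = 4;   // hparams.n_merge
    const int side_after_pool  = patches_per_side / kernel_size;
    printf("%d tokens -> %d tokens\n",
           patches_per_side * patches_per_side,
           side_after_pool * side_after_pool);
    return 0;
}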

View File

@ -0,0 +1,107 @@
#include "models.h"
ggml_cgraph * clip_graph_whisper_enc::build() {
const int n_frames = img.nx;
const int n_pos = n_frames / 2;
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
ggml_tensor * inp = build_inp_raw(1);
// conv1d block
{
// convolution + gelu
ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
cur = ggml_add(ctx0, cur, model.conv1d_1_b);
cur = ggml_gelu_erf(ctx0, cur);
cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
cur = ggml_add(ctx0, cur, model.conv1d_2_b);
cur = ggml_gelu_erf(ctx0, cur);
// transpose
inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
cb(inp, "after_conv1d", -1);
}
// sanity check (only check one layer, but it should be the same for all)
GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
GGML_ASSERT(model.layers[0].q_b);
GGML_ASSERT(model.layers[0].v_b);
GGML_ASSERT(!model.layers[0].k_b); // no bias for k
GGML_ASSERT(model.post_ln_w && model.post_ln_b);
ggml_tensor * pos_embd_selected = ggml_view_2d(
ctx0, model.position_embeddings,
model.position_embeddings->ne[0], n_pos,
model.position_embeddings->nb[1], 0
);
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
pos_embd_selected,
nullptr);
cb(cur, "after_transformer", -1);
if (model.audio_has_stack_frames()) {
// StackAudioFrames
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
int64_t stride = n_embd * hparams.proj_stack_factor;
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
int64_t pad = padded_len - ggml_nelements(cur);
if (pad > 0) {
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
}
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
ggml_row_size(cur->type, stride), 0);
cb(cur, "after_stacked", -1);
}
if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
// UltravoxProjector
// pre-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
// ffn in
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
// swiglu
// see SwiGLU in ultravox_model.py: silu is applied to the second half of the projection, not the first half
cur = ggml_swiglu_swapped(ctx0, cur);
// mid-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
// ffn out
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
// projector
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
cur = ggml_add(ctx0, cur, model.mm_fc_b);
} else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
// projector
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
} else {
GGML_ABORT("%s: unknown projector type", __func__);
}
cb(cur, "projected", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
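StackAudioFrames flattens the encoder output, pads the element count up to a multiple of n_embd * proj_stack_factor, and views the result as rows of that stride, so proj_stack_factor consecutive frames are stacked into one projector input. A standalone sketch of the padding arithmetic (plain C++; GGML_PAD is replaced by an equivalent round-up, and the sizes are illustrative):

#include <cstdint>
#include <cstdio>

// round n up to the next multiple (what GGML_PAD does)
static int64_t pad_to(int64_t n, int64_t multiple) {
    return ((n + multiple - 1) / multiple) * multiple;
}

int main() {
    const int64_t n_embd            = 1280;   // illustrative encoder width
    const int64_t proj_stack_factor = 8;      // illustrative stack factor
    const int64_t n_tokens          = 1500;   // encoder output length
    const int64_t stride     = n_embd * proj_stack_factor;
    const int64_t n_elements = n_embd * n_tokens;
    const int64_t padded     = pad_to(n_elements, stride);
    printf("pad %lld elements to %lld -> %lld stacked rows\n",
           (long long) n_elements, (long long) padded, (long long) (padded / stride));
    return 0;
}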

View File

@ -6,6 +6,8 @@
#include <vector>
#include <string>
#define MTMD_INTERNAL_HEADER
#define WHISPER_ASSERT GGML_ASSERT
#define WHISPER_SAMPLE_RATE 16000

View File

@ -32,6 +32,10 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb/stb_image.h"
#ifdef MTMD_INTERNAL_HEADER
#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
#endif
//
// internal logging functions
//

View File

@ -22,6 +22,11 @@
* Issues related to API usage may receive lower priority support.
*
* For the usage, see an example in mtmd-cli.cpp
*
* For contributors:
* - Make sure the C API is aligned with the libllama C API (as in llama.h)
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
* - Keep the API minimal, do not expose internal details unless necessary
*/
#ifdef LLAMA_SHARED

View File

@ -54,9 +54,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) | | `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) | | `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) | | `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) | | `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--no-escape` | do not process escape sequences |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) | | `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
@ -66,15 +65,15 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) | | `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) | | `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) | | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit | | `--list-devices` | print list of available devices and exit |
@ -87,7 +86,7 @@ For the full list of features, please refer to [server's changelog](https://githu
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `--check-tensors` | check model tensor data for invalid values (default: false) | | `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
| `--no-op-offload` | disable offloading host tensor operations to device (default: false) | | `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | | `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors | | `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
@ -157,19 +156,18 @@ For the full list of features, please refer to [server's changelog](https://githu
| -------- | ----------- | | -------- | ----------- |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) | | `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-sp, --special` | special tokens output enabled (default: false) | | `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run | | `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) | | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `--mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) | | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) | | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) | | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model | | `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
@ -180,7 +178,7 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) | | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) | | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) | | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) | | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@ -193,20 +191,19 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) | | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) | | `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) | | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) | | `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
@ -236,6 +233,11 @@ For the full list of features, please refer to [server's changelog](https://githu
Note: If both the command line argument and the environment variable are set for the same param, the argument will take precedence over the env var. Note: If both the command line argument and the environment variable are set for the same param, the argument will take precedence over the env var.
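For example, here is a minimal sketch of that precedence, using the `--cache-ram` option from the table above (the model path `model.gguf` is a placeholder):

```sh
# The explicit flag wins: the cache limit is set to 4096 MiB,
# not the 2048 MiB requested via the environment variable.
LLAMA_ARG_CACHE_RAM=2048 ./llama-server -m model.gguf --cache-ram 4096
```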
For boolean options like `--mmap` or `--kv-offload`, the environment variable is interpreted as follows (see the invocation sketch after this list):
- `LLAMA_ARG_MMAP=true` means enabled; other accepted values are `1`, `on`, and `enabled`
- `LLAMA_ARG_MMAP=false` means disabled; other accepted values are `0`, `off`, and `disabled`
- If `LLAMA_ARG_NO_MMAP` is present, mmap is disabled regardless of the variable's value
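For instance, each of the following invocations disables memory-mapping of the model (an illustrative sketch; `model.gguf` is again a placeholder):

```sh
LLAMA_ARG_MMAP=false ./llama-server -m model.gguf  # canonical form
LLAMA_ARG_MMAP=off   ./llama-server -m model.gguf  # accepted alias
LLAMA_ARG_NO_MMAP=1  ./llama-server -m model.gguf  # legacy variable; its value is ignored
```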
Example usage of docker compose with environment variables: Example usage of docker compose with environment variables:
```yml ```yml

Binary file not shown.

View File

@ -16,6 +16,7 @@
#include <atomic> #include <atomic>
#include <chrono> #include <chrono>
#include <queue> #include <queue>
#include <filesystem>
#ifdef _WIN32 #ifdef _WIN32
#include <winsock2.h> #include <winsock2.h>
@ -171,7 +172,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
} }
// read base args from router's argv // read base args from router's argv
common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); common_params_to_map(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
// remove any router-controlled args from base_args // remove any router-controlled args from base_args
for (const auto & cargs : control_args) { for (const auto & cargs : control_args) {

View File

@ -41,7 +41,7 @@
"@tailwindcss/vite": "^4.0.0", "@tailwindcss/vite": "^4.0.0",
"@types/node": "^22", "@types/node": "^22",
"@vitest/browser": "^3.2.3", "@vitest/browser": "^3.2.3",
"bits-ui": "^2.8.11", "bits-ui": "^2.14.4",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"dexie": "^4.0.11", "dexie": "^4.0.11",
"eslint": "^9.18.0", "eslint": "^9.18.0",
@ -3343,17 +3343,17 @@
} }
}, },
"node_modules/bits-ui": { "node_modules/bits-ui": {
"version": "2.8.11", "version": "2.14.4",
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz", "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz",
"integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==", "integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@floating-ui/core": "^1.7.1", "@floating-ui/core": "^1.7.1",
"@floating-ui/dom": "^1.7.1", "@floating-ui/dom": "^1.7.1",
"esm-env": "^1.1.2", "esm-env": "^1.1.2",
"runed": "^0.29.1", "runed": "^0.35.1",
"svelte-toolbelt": "^0.9.3", "svelte-toolbelt": "^0.10.6",
"tabbable": "^6.2.0" "tabbable": "^6.2.0"
}, },
"engines": { "engines": {
@ -3368,9 +3368,9 @@
} }
}, },
"node_modules/bits-ui/node_modules/runed": { "node_modules/bits-ui/node_modules/runed": {
"version": "0.29.2", "version": "0.35.1",
"resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz", "resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz",
"integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==", "integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==",
"dev": true, "dev": true,
"funding": [ "funding": [
"https://github.com/sponsors/huntabyte", "https://github.com/sponsors/huntabyte",
@ -3378,23 +3378,31 @@
], ],
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"esm-env": "^1.0.0" "dequal": "^2.0.3",
"esm-env": "^1.0.0",
"lz-string": "^1.5.0"
}, },
"peerDependencies": { "peerDependencies": {
"@sveltejs/kit": "^2.21.0",
"svelte": "^5.7.0" "svelte": "^5.7.0"
},
"peerDependenciesMeta": {
"@sveltejs/kit": {
"optional": true
}
} }
}, },
"node_modules/bits-ui/node_modules/svelte-toolbelt": { "node_modules/bits-ui/node_modules/svelte-toolbelt": {
"version": "0.9.3", "version": "0.10.6",
"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz", "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz",
"integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==", "integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==",
"dev": true, "dev": true,
"funding": [ "funding": [
"https://github.com/sponsors/huntabyte" "https://github.com/sponsors/huntabyte"
], ],
"dependencies": { "dependencies": {
"clsx": "^2.1.1", "clsx": "^2.1.1",
"runed": "^0.29.0", "runed": "^0.35.1",
"style-to-object": "^1.0.8" "style-to-object": "^1.0.8"
}, },
"engines": { "engines": {

View File

@ -43,7 +43,7 @@
"@tailwindcss/vite": "^4.0.0", "@tailwindcss/vite": "^4.0.0",
"@types/node": "^22", "@types/node": "^22",
"@vitest/browser": "^3.2.3", "@vitest/browser": "^3.2.3",
"bits-ui": "^2.8.11", "bits-ui": "^2.14.4",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"dexie": "^4.0.11", "dexie": "^4.0.11",
"eslint": "^9.18.0", "eslint": "^9.18.0",

View File

@ -331,6 +331,7 @@
class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
? 'cursor-not-allowed opacity-60' ? 'cursor-not-allowed opacity-60'
: ''} {className}" : ''} {className}"
data-slot="chat-form"
> >
<ChatAttachmentsList <ChatAttachmentsList
bind:uploadedFiles bind:uploadedFiles

View File

@ -1,6 +1,5 @@
<script lang="ts"> <script lang="ts">
import { Input } from '$lib/components/ui/input'; import { SearchInput } from '$lib/components/app';
import { Search } from '@lucide/svelte';
interface Props { interface Props {
value?: string; value?: string;
@ -15,19 +14,6 @@
onInput, onInput,
class: className class: className
}: Props = $props(); }: Props = $props();
function handleInput(event: Event) {
const target = event.target as HTMLInputElement;
value = target.value;
onInput?.(target.value);
}
</script> </script>
<div class="relative mb-4 {className}"> <SearchInput bind:value {placeholder} {onInput} class="mb-4 {className}" />
<Search
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
/>
<Input bind:value class="pl-10" oninput={handleInput} {placeholder} type="search" />
</div>

View File

@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt
export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte'; export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte';
export { default as MarkdownContent } from './misc/MarkdownContent.svelte'; export { default as MarkdownContent } from './misc/MarkdownContent.svelte';
export { default as RemoveButton } from './misc/RemoveButton.svelte'; export { default as RemoveButton } from './misc/RemoveButton.svelte';
export { default as SearchInput } from './misc/SearchInput.svelte';
export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte'; export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte';
export { default as ModelsSelector } from './models/ModelsSelector.svelte'; export { default as ModelsSelector } from './models/ModelsSelector.svelte';

View File

@ -0,0 +1,73 @@
<script lang="ts">
import { Input } from '$lib/components/ui/input';
import { Search, X } from '@lucide/svelte';
interface Props {
value?: string;
placeholder?: string;
onInput?: (value: string) => void;
onClose?: () => void;
onKeyDown?: (event: KeyboardEvent) => void;
class?: string;
id?: string;
ref?: HTMLInputElement | null;
}
let {
value = $bindable(''),
placeholder = 'Search...',
onInput,
onClose,
onKeyDown,
class: className,
id,
ref = $bindable(null)
}: Props = $props();
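// Show the clear button when there is text to clear, or when a close handler is provided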
let showClearButton = $derived(!!value || !!onClose);
function handleInput(event: Event) {
const target = event.target as HTMLInputElement;
value = target.value;
onInput?.(target.value);
}
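// First activation clears the text and refocuses the input; if the input is already empty, fall through to the close handler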
function handleClear() {
if (value) {
value = '';
onInput?.('');
ref?.focus();
} else {
onClose?.();
}
}
</script>
<div class="relative {className}">
<Search
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
/>
<Input
{id}
bind:value
bind:ref
class="pl-9 {showClearButton ? 'pr-9' : ''}"
oninput={handleInput}
onkeydown={onKeyDown}
{placeholder}
type="search"
/>
{#if showClearButton}
<button
type="button"
class="absolute top-1/2 right-3 -translate-y-1/2 transform text-muted-foreground transition-colors hover:text-foreground"
onclick={handleClear}
aria-label={value ? 'Clear search' : 'Close'}
>
<X class="h-4 w-4" />
</button>
{/if}
</div>

View File

@ -2,8 +2,8 @@
import { onMount, tick } from 'svelte'; import { onMount, tick } from 'svelte';
import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte'; import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte';
import * as Tooltip from '$lib/components/ui/tooltip'; import * as Tooltip from '$lib/components/ui/tooltip';
import * as Popover from '$lib/components/ui/popover';
import { cn } from '$lib/components/ui/utils'; import { cn } from '$lib/components/ui/utils';
import { portalToBody } from '$lib/utils';
import { import {
modelsStore, modelsStore,
modelOptions, modelOptions,
@ -17,12 +17,8 @@
import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte'; import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte';
import { ServerModelStatus } from '$lib/enums'; import { ServerModelStatus } from '$lib/enums';
import { isRouterMode } from '$lib/stores/server.svelte'; import { isRouterMode } from '$lib/stores/server.svelte';
import { DialogModelInformation } from '$lib/components/app'; import { DialogModelInformation, SearchInput } from '$lib/components/app';
import { import type { ModelOption } from '$lib/types/models';
MENU_MAX_WIDTH,
MENU_OFFSET,
VIEWPORT_GUTTER
} from '$lib/constants/floating-ui-constraints';
interface Props { interface Props {
class?: string; class?: string;
@ -145,185 +141,126 @@
return options.some((option) => option.model === currentModel); return options.some((option) => option.model === currentModel);
}); });
let isOpen = $state(false); let searchTerm = $state('');
let showModelDialog = $state(false); let searchInputRef = $state<HTMLInputElement | null>(null);
let container: HTMLDivElement | null = null; let highlightedIndex = $state<number>(-1);
let menuRef = $state<HTMLDivElement | null>(null);
let triggerButton = $state<HTMLButtonElement | null>(null);
let menuPosition = $state<{
top: number;
left: number;
width: number;
placement: 'top' | 'bottom';
maxHeight: number;
} | null>(null);
onMount(async () => { let filteredOptions: ModelOption[] = $derived(
try { (() => {
await modelsStore.fetch(); const term = searchTerm.trim().toLowerCase();
} catch (error) { if (!term) return options;
console.error('Unable to load models:', error);
} return options.filter(
(option) =>
option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term)
);
})()
);
// Get indices of compatible options for keyboard navigation
let compatibleIndices = $derived(
filteredOptions
.map((option, index) => (isModelCompatible(option) ? index : -1))
.filter((i) => i !== -1)
);
// Reset highlighted index when search term changes
$effect(() => {
void searchTerm;
highlightedIndex = -1;
}); });
function toggleOpen() { let isOpen = $state(false);
let showModelDialog = $state(false);
onMount(() => {
modelsStore.fetch().catch((error) => {
console.error('Unable to load models:', error);
});
});
function handleOpenChange(open: boolean) {
if (loading || updating) return; if (loading || updating) return;
if (isRouter) { if (open) {
// Router mode: show dropdown isOpen = true;
if (isOpen) { searchTerm = '';
closeMenu(); highlightedIndex = -1;
} else {
openMenu(); // Focus search input after popover opens
tick().then(() => {
requestAnimationFrame(() => searchInputRef?.focus());
});
if (isRouter) {
modelsStore.fetchRouterModels().then(() => {
modelsStore.fetchModalitiesForLoadedModels();
});
} }
} else { } else {
// Single model mode: show dialog isOpen = false;
showModelDialog = true; searchTerm = '';
highlightedIndex = -1;
} }
} }
async function openMenu() { function handleTriggerClick() {
if (loading || updating) return; if (loading || updating) return;
isOpen = true; if (!isRouter) {
await tick(); // Single model mode: show dialog instead of popover
updateMenuPosition(); showModelDialog = true;
requestAnimationFrame(() => updateMenuPosition());
if (isRouter) {
modelsStore.fetchRouterModels().then(() => {
modelsStore.fetchModalitiesForLoadedModels();
});
} }
// For router mode, the Popover handles open/close
} }
export function open() { export function open() {
if (isRouter) { if (isRouter) {
openMenu(); handleOpenChange(true);
} else { } else {
showModelDialog = true; showModelDialog = true;
} }
} }
function closeMenu() { function closeMenu() {
if (!isOpen) return; handleOpenChange(false);
isOpen = false;
menuPosition = null;
} }
function handlePointerDown(event: PointerEvent) { function handleSearchKeyDown(event: KeyboardEvent) {
if (!container) return; if (event.isComposing) return;
const target = event.target as Node | null; if (event.key === 'ArrowDown') {
event.preventDefault();
if (compatibleIndices.length === 0) return;
if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) { const currentPos = compatibleIndices.indexOf(highlightedIndex);
closeMenu(); if (currentPos === -1 || currentPos === compatibleIndices.length - 1) {
} highlightedIndex = compatibleIndices[0];
}
function handleKeydown(event: KeyboardEvent) {
if (event.key === 'Escape') {
closeMenu();
}
}
function handleResize() {
if (isOpen) {
updateMenuPosition();
}
}
function updateMenuPosition() {
if (!isOpen || !triggerButton || !menuRef) return;
const triggerRect = triggerButton.getBoundingClientRect();
const viewportWidth = window.innerWidth;
const viewportHeight = window.innerHeight;
if (viewportWidth === 0 || viewportHeight === 0) return;
const scrollWidth = menuRef.scrollWidth;
const scrollHeight = menuRef.scrollHeight;
const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
const safeMaxWidth =
constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
let width = Math.min(
Math.max(triggerRect.width, scrollWidth, desiredMinWidth),
safeMaxWidth || 320
);
const availableBelow = Math.max(
0,
viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
);
const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
function computePlacement(placement: 'top' | 'bottom') {
const available = placement === 'bottom' ? availableBelow : availableAbove;
const allowedHeight =
available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
const maxHeight = Math.min(scrollHeight, allowedHeight);
const height = Math.max(0, maxHeight);
let top: number;
if (placement === 'bottom') {
const rawTop = triggerRect.bottom + MENU_OFFSET;
const minTop = VIEWPORT_GUTTER;
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
if (maxTop < minTop) {
top = minTop;
} else {
top = Math.min(Math.max(rawTop, minTop), maxTop);
}
} else { } else {
const rawTop = triggerRect.top - MENU_OFFSET - height; highlightedIndex = compatibleIndices[currentPos + 1];
const minTop = VIEWPORT_GUTTER; }
const maxTop = viewportHeight - VIEWPORT_GUTTER - height; } else if (event.key === 'ArrowUp') {
if (maxTop < minTop) { event.preventDefault();
top = minTop; if (compatibleIndices.length === 0) return;
} else {
top = Math.max(Math.min(rawTop, maxTop), minTop); const currentPos = compatibleIndices.indexOf(highlightedIndex);
if (currentPos === -1 || currentPos === 0) {
highlightedIndex = compatibleIndices[compatibleIndices.length - 1];
} else {
highlightedIndex = compatibleIndices[currentPos - 1];
}
} else if (event.key === 'Enter') {
event.preventDefault();
if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) {
const option = filteredOptions[highlightedIndex];
if (isModelCompatible(option)) {
handleSelect(option.id);
} }
} } else if (compatibleIndices.length > 0) {
// No selection - highlight first compatible option
return { placement, top, height, maxHeight }; highlightedIndex = compatibleIndices[0];
}
const belowMetrics = computePlacement('bottom');
const aboveMetrics = computePlacement('top');
let metrics = belowMetrics;
if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
metrics = aboveMetrics;
}
let left = triggerRect.right - width;
const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
if (maxLeft < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
} else {
if (left > maxLeft) {
left = maxLeft;
}
if (left < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
} }
} }
menuPosition = {
top: Math.round(metrics.top),
left: Math.round(left),
width: Math.round(width),
placement: metrics.placement,
maxHeight: Math.round(metrics.maxHeight)
};
} }
async function handleSelect(modelId: string) { async function handleSelect(modelId: string) {
@ -356,6 +293,14 @@
if (shouldCloseMenu) { if (shouldCloseMenu) {
closeMenu(); closeMenu();
// Focus the chat textarea after model selection
requestAnimationFrame(() => {
const textarea = document.querySelector<HTMLTextAreaElement>(
'[data-slot="chat-form"] textarea'
);
textarea?.focus();
});
} }
} }
@ -404,10 +349,7 @@
} }
</script> </script>
<svelte:window onresize={handleResize} /> <div class={cn('relative inline-flex flex-col items-end gap-1', className)}>
<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
<div class={cn('relative inline-flex flex-col items-end gap-1', className)} bind:this={container}>
{#if loading && options.length === 0 && isRouter} {#if loading && options.length === 0 && isRouter}
<div class="flex items-center gap-2 text-xs text-muted-foreground"> <div class="flex items-center gap-2 text-xs text-muted-foreground">
<Loader2 class="h-3.5 w-3.5 animate-spin" /> <Loader2 class="h-3.5 w-3.5 animate-spin" />
@ -418,9 +360,8 @@
{:else} {:else}
{@const selectedOption = getDisplayOption()} {@const selectedOption = getDisplayOption()}
<div class="relative"> <Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
<button <Popover.Trigger
type="button"
class={cn( class={cn(
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`, `inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
!isCurrentModelInCache() !isCurrentModelInCache()
@ -430,15 +371,11 @@
: isHighlightedCurrentModelActive : isHighlightedCurrentModelActive
? 'text-foreground' ? 'text-foreground'
: 'text-muted-foreground', : 'text-muted-foreground',
isOpen ? 'text-foreground' : '', isOpen ? 'text-foreground' : ''
className
)} )}
style="max-width: min(calc(100cqw - 6.5rem), 32rem)" style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
aria-haspopup={isRouter ? 'listbox' : undefined} onclick={handleTriggerClick}
aria-expanded={isRouter ? isOpen : undefined} disabled={disabled || updating || !isRouter}
onclick={toggleOpen}
bind:this={triggerButton}
disabled={disabled || updating}
> >
<Package class="h-3.5 w-3.5" /> <Package class="h-3.5 w-3.5" />
@ -451,33 +388,35 @@
{:else if isRouter} {:else if isRouter}
<ChevronDown class="h-3 w-3.5" /> <ChevronDown class="h-3 w-3.5" />
{/if} {/if}
</button> </Popover.Trigger>
{#if isOpen && isRouter} <Popover.Content
<div class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
bind:this={menuRef} align="end"
use:portalToBody sideOffset={8}
class={cn( collisionPadding={16}
'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity', >
menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0' <div class="flex max-h-[50dvh] flex-col overflow-hidden">
)}
role="listbox"
style:top={menuPosition ? `${menuPosition.top}px` : undefined}
style:left={menuPosition ? `${menuPosition.left}px` : undefined}
style:width={menuPosition ? `${menuPosition.width}px` : undefined}
data-placement={menuPosition?.placement ?? 'bottom'}
>
<div <div
class="overflow-y-auto py-1" class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
style:max-height={menuPosition && menuPosition.maxHeight > 0 >
? `${menuPosition.maxHeight}px` <SearchInput
: undefined} id="model-search"
placeholder="Search models..."
bind:value={searchTerm}
bind:ref={searchInputRef}
onClose={closeMenu}
onKeyDown={handleSearchKeyDown}
/>
</div>
<div
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
> >
{#if !isCurrentModelInCache() && currentModel} {#if !isCurrentModelInCache() && currentModel}
<!-- Show unavailable model as first option (disabled) --> <!-- Show unavailable model as first option (disabled) -->
<button <button
type="button" type="button"
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-3 py-2 text-left text-sm text-red-400" class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
role="option" role="option"
aria-selected="true" aria-selected="true"
aria-disabled="true" aria-disabled="true"
@ -488,20 +427,25 @@
</button> </button>
<div class="my-1 h-px bg-border"></div> <div class="my-1 h-px bg-border"></div>
{/if} {/if}
{#each options as option (option.id)} {#if filteredOptions.length === 0}
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
{/if}
{#each filteredOptions as option, index (option.id)}
{@const status = getModelStatus(option.model)} {@const status = getModelStatus(option.model)}
{@const isLoaded = status === ServerModelStatus.LOADED} {@const isLoaded = status === ServerModelStatus.LOADED}
{@const isLoading = status === ServerModelStatus.LOADING} {@const isLoading = status === ServerModelStatus.LOADING}
{@const isSelected = currentModel === option.model || activeId === option.id} {@const isSelected = currentModel === option.model || activeId === option.id}
{@const isCompatible = isModelCompatible(option)} {@const isCompatible = isModelCompatible(option)}
{@const isHighlighted = index === highlightedIndex}
{@const missingModalities = getMissingModalities(option)} {@const missingModalities = getMissingModalities(option)}
<div <div
class={cn( class={cn(
'group flex w-full items-center gap-2 px-3 py-2 text-left text-sm transition focus:outline-none', 'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
isCompatible isCompatible
? 'cursor-pointer hover:bg-muted focus:bg-muted' ? 'cursor-pointer hover:bg-muted focus:bg-muted'
: 'cursor-not-allowed opacity-50', : 'cursor-not-allowed opacity-50',
isSelected isSelected || isHighlighted
? 'bg-accent text-accent-foreground' ? 'bg-accent text-accent-foreground'
: isCompatible : isCompatible
? 'hover:bg-accent hover:text-accent-foreground' ? 'hover:bg-accent hover:text-accent-foreground'
@ -509,10 +453,11 @@
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground' isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
)} )}
role="option" role="option"
aria-selected={isSelected} aria-selected={isSelected || isHighlighted}
aria-disabled={!isCompatible} aria-disabled={!isCompatible}
tabindex={isCompatible ? 0 : -1} tabindex={isCompatible ? 0 : -1}
onclick={() => isCompatible && handleSelect(option.id)} onclick={() => isCompatible && handleSelect(option.id)}
onmouseenter={() => (highlightedIndex = index)}
onkeydown={(e) => { onkeydown={(e) => {
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) { if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
e.preventDefault(); e.preventDefault();
@ -586,8 +531,8 @@
{/each} {/each}
</div> </div>
</div> </div>
{/if} </Popover.Content>
</div> </Popover.Root>
{/if} {/if}
</div> </div>

View File

@ -0,0 +1,19 @@
import Root from './popover.svelte';
import Close from './popover-close.svelte';
import Content from './popover-content.svelte';
import Trigger from './popover-trigger.svelte';
import Portal from './popover-portal.svelte';
export {
Root,
Content,
Trigger,
Close,
Portal,
//
Root as Popover,
Content as PopoverContent,
Trigger as PopoverTrigger,
Close as PopoverClose,
Portal as PopoverPortal
};

View File

@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { ref = $bindable(null), ...restProps }: PopoverPrimitive.CloseProps = $props();
</script>
<PopoverPrimitive.Close bind:ref data-slot="popover-close" {...restProps} />

View File

@ -0,0 +1,37 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
import PopoverPortal from './popover-portal.svelte';
import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
import type { ComponentProps } from 'svelte';
let {
ref = $bindable(null),
class: className,
sideOffset = 4,
side,
align = 'center',
collisionPadding = 8,
avoidCollisions = true,
portalProps,
...restProps
}: PopoverPrimitive.ContentProps & {
portalProps?: WithoutChildrenOrChild<ComponentProps<typeof PopoverPortal>>;
} = $props();
</script>
<PopoverPortal {...portalProps}>
<PopoverPrimitive.Content
bind:ref
data-slot="popover-content"
{sideOffset}
{side}
{align}
{collisionPadding}
{avoidCollisions}
class={cn(
'z-50 w-72 origin-(--bits-popover-content-transform-origin) rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-hidden data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-end-2 data-[side=right]:slide-in-from-start-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
className
)}
{...restProps}
/>
</PopoverPortal>

View File

@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { ...restProps }: PopoverPrimitive.PortalProps = $props();
</script>
<PopoverPrimitive.Portal {...restProps} />

Some files were not shown because too many files have changed in this diff.