diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index cd8f87b2ea..83182c9700 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -4,7 +4,7 @@ # Define the CANN base image for easier version updates later ARG CHIP_TYPE=910b -ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11 +ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11 # ============================================================================== # BUILD STAGE @@ -111,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"] # ============================================================================== FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index 6e16ecda44..b9e84ab986 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 54f793d0a3..fed5863157 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index d1a8fbed4c..adebf08229 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"] FROM base AS light COPY --from=build /app/lib/ /app -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index faa3500e61..34d6ad9f40 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index d6bf28b105..53c3ed8d88 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index b7c9457680..1e66f061d5 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin # Copy llama.cpp binaries and libraries COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin -COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin +COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ] diff --git a/.devops/tools.sh b/.devops/tools.sh index 8a3a693400..cc5ee17dfd 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then exec ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then exec ./llama-cli "$@" +elif [[ "$arg1" == '--run-legacy' 
|| "$arg1" == '-l' ]]; then + exec ./llama-completion "$@" elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then exec ./llama-bench "$@" elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then @@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then else echo "Unknown command: $arg1" echo "Available commands: " - echo " --run (-r): Run a model previously converted into ggml" - echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" + echo " --run (-r): Run a model (chat) previously converted into ggml" + echo " ex: -m /models/7B/ggml-model-q4_0.bin" + echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512" echo " --bench (-b): Benchmark the performance of the inference for various parameters." echo " ex: -m model.gguf" echo " --perplexity (-p): Measure the perplexity of a model over a given text." diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index fd7195c5be..b37b4f277d 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 383427f36f..9fe1401df4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1400,25 +1400,54 @@ jobs: chip_type: ['910b', '310p'] build: ['Release'] runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }} steps: - name: Checkout uses: actions/checkout@v4 + with: + fetch-depth: 0 - - name: Dependencies + - name: Free up disk space + uses: ggml-org/free-disk-space@v1.3.1 + with: + tool-cache: true + + - name: Set container image + id: cann-image run: | - yum update -y - yum install -y git gcc gcc-c++ make cmake libcurl-devel + image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}" + echo "image=${image}" >> "${GITHUB_OUTPUT}" + + - name: Pull container image + run: docker pull "${{ steps.cann-image.outputs.image }}" - name: Build + env: + BUILD_TYPE: ${{ matrix.build }} + SOC_TYPE: ascend${{ matrix.chip_type }} run: | - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + HOST_UID=$(id -u) + HOST_GID=$(id -g) - cmake -S . -B build \ - -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ - -DGGML_CANN=on \ - -DSOC_TYPE=ascend${{ matrix.chip_type }} - cmake --build build -j $(nproc) + docker run --rm \ + -v "${PWD}:/workspace" \ + -w /workspace \ + -e SOC_TYPE=${SOC_TYPE} \ + -e BUILD_TYPE=${BUILD_TYPE} \ + "${{ steps.cann-image.outputs.image }}" \ + bash -lc ' + set -e + yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel + yum clean all && rm -rf /var/cache/yum + git config --global --add safe.directory "/workspace" + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + cmake -S . 
-B build \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DGGML_CANN=on \ + -DSOC_TYPE=${SOC_TYPE} + cmake --build build -j $(nproc) + + chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build + ' # TODO: simplify the following workflows using a matrix # TODO: run lighter CI on PRs and the full CI only on master (if needed) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 77aec20c11..446cae9f84 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -731,6 +731,78 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz + + openEuler-cann: + strategy: + matrix: + arch: [x86, aarch64] + chip_type: ['910b', '310p'] + build: ['Release'] + runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Free up disk space + uses: ggml-org/free-disk-space@v1.3.1 + with: + tool-cache: true + + - name: Set container image + id: cann-image + run: | + image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}" + echo "image=${image}" >> "${GITHUB_OUTPUT}" + + - name: Pull container image + run: docker pull "${{ steps.cann-image.outputs.image }}" + + - name: Build + env: + BUILD_TYPE: ${{ matrix.build }} + SOC_TYPE: ascend${{ matrix.chip_type }} + run: | + HOST_UID=$(id -u) + HOST_GID=$(id -g) + + docker run --rm \ + -v "${PWD}:/workspace" \ + -w /workspace \ + -e SOC_TYPE=${SOC_TYPE} \ + -e BUILD_TYPE=${BUILD_TYPE} \ + "${{ steps.cann-image.outputs.image }}" \ + bash -lc ' + set -e + yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel + yum clean all && rm -rf /var/cache/yum + git config --global --add safe.directory "/workspace" + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DGGML_CANN=on \ + -DSOC_TYPE=${SOC_TYPE} + cmake --build build -j $(nproc) + + chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build + ' + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + run: | + cp LICENSE ./build/bin/ + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . 
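          # (editorial illustration, not part of the workflow) the --transform above
          # prefixes every packed file with a versioned directory, so the archive is
          # expected to unpack roughly as:
          #   llama-<tag>/llama-cli
          #   llama-<tag>/llama-server
          #   llama-<tag>/LICENSE
          # assuming the usual build outputs land in ./build/bin; a quick local sanity
          # check would be: tar -tzf llama-<tag>-bin-910b-openEuler-aarch64.tar.gz | head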
+ + - name: Upload artifacts (tar) + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz + name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz + release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -752,6 +824,7 @@ jobs: - macOS-arm64 - macOS-x64 - ios-xcode-build + - openEuler-cann steps: - name: Clone @@ -844,6 +917,12 @@ jobs: - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip) - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip) + **openEuler:** + - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz) + - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz) + - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz) + - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz) + - name: Upload release id: upload_release uses: actions/github-script@v3 diff --git a/common/arg.cpp b/common/arg.cpp index a31dcbc689..19f22f883f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) { bool common_arg::get_value_from_env(std::string & output) const { if (env == nullptr) return false; + if (!args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + char * neg_value = std::getenv(neg_env.c_str()); + if (neg_value) { + output = "0"; // falsey + return true; + } + } char * value = std::getenv(env); if (value) { output = value; @@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const { } bool common_arg::has_value_from_env() const { + if (env != nullptr && !args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + if (std::getenv(neg_env.c_str())) { + return true; + } + } return env != nullptr && std::getenv(env); } @@ -151,9 +169,10 @@ std::string common_arg::to_string() const { std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { + auto all_args = get_args(); // also contains args_neg + for (const auto & arg : all_args) { + if (arg == all_args.front()) { + if (all_args.size() == 1) { ss << arg; } else { // first arg is usually abbreviation, we need padding to make it more beautiful @@ -162,7 +181,7 @@ std::string common_arg::to_string() const { ss << tmp << spaces; } } else { - ss << arg << (arg != args.back() ? ", " : ""); + ss << arg << (arg != all_args.back() ? 
", " : ""); } } if (value_hint) ss << " " << value_hint; @@ -181,6 +200,31 @@ std::string common_arg::to_string() const { return ss.str(); } +std::vector common_arg::get_args() const { + std::vector result; + for (const auto & arg : args) { + result.push_back(std::string(arg)); + } + for (const auto & arg : args_neg) { + result.push_back(std::string(arg)); + } + return result; +} + +std::vector common_arg::get_env() const { + std::vector result; + if (env) { + result.push_back(std::string(env)); + } + if (!args_neg.empty() && env) { + // for compatibility, we need to add LLAMA_ARG_NO_ variant + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + result.push_back(neg_env); + } + return result; +} + // // utils // @@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() { return msg.str(); } +static bool parse_bool_value(const std::string & value) { + if (is_truthy(value)) { + return true; + } else if (is_falsey(value)) { + return false; + } else { + throw std::invalid_argument("invalid boolean value"); + } +} + // // CLI argument parsing functions // @@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() { static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; - std::unordered_map arg_to_options; + std::unordered_map> arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; + arg_to_options[arg] = {&opt, /* is_positive */ true}; + } + for (const auto & arg : opt.args_neg) { + arg_to_options[arg] = {&opt, /* is_positive */ false}; } } @@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context std::string value; if (opt.get_value_from_env(value)) { try { - if (opt.handler_void && (value == "1" || value == "true")) { + if (opt.handler_void && is_truthy(value)) { opt.handler_void(params); } if (opt.handler_int) { opt.handler_int(params, std::stoi(value)); } + if (opt.handler_bool) { + opt.handler_bool(params, parse_bool_value(value)); + } if (opt.handler_string) { opt.handler_string(params, value); continue; @@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } - auto opt = *arg_to_options[arg]; + auto & tmp = arg_to_options[arg]; + auto opt = *tmp.first; + bool is_positive = tmp.second; if (opt.has_value_from_env()) { fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); } @@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context opt.handler_void(params); continue; } + if (opt.handler_bool) { + opt.handler_bool(params, is_positive); + continue; + } // arg with single value check_arg(i); @@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); + arg.c_str(), e.what(), opt.to_string().c_str())); } } @@ -438,7 +504,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // model is required (except for server) // TODO @ngxson : maybe show a list of available models 
in CLI in this case - if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) { + if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) { throw std::invalid_argument("error: --model is required\n"); } @@ -573,6 +639,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-batched-bench", "llama-bench", "llama-cli", + "llama-completion", "llama-convert-llama2c-to-ggml", "llama-cvector-generator", "llama-embedding", @@ -750,11 +817,11 @@ static std::string list_builtin_chat_templates() { } bool common_arg_utils::is_truthy(const std::string & value) { - return value == "on" || value == "enabled" || value == "1"; + return value == "on" || value == "enabled" || value == "true" || value == "1"; } bool common_arg_utils::is_falsey(const std::string & value) { - return value == "off" || value == "disabled" || value == "0"; + return value == "off" || value == "disabled" || value == "false" || value == "0"; } bool common_arg_utils::is_autoy(const std::string & value) { @@ -839,10 +906,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( + {"--display-prompt"}, {"--no-display-prompt"}, - string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](common_params & params) { - params.display_prompt = false; + string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"), + [](common_params & params, bool value) { + params.display_prompt = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -1055,18 +1123,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.kv_unified = true; } ).set_env("LLAMA_ARG_KV_UNIFIED")); - add_opt(common_arg( - {"--no-context-shift"}, - string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](common_params & params) { - params.ctx_shift = false; - } - ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--context-shift"}, - string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), - [](common_params & params) { - params.ctx_shift = true; + {"--no-context-shift"}, + string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.ctx_shift = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); add_opt(common_arg( @@ -1106,20 +1168,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( + {"--perf"}, {"--no-perf"}, - string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), - [](common_params & params) { - params.no_perf = true; - params.sampling.no_perf = true; + string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), + [](common_params & params, bool value) { + params.no_perf = !value; + params.sampling.no_perf = !value; } - ).set_env("LLAMA_ARG_NO_PERF")); + ).set_env("LLAMA_ARG_PERF")); add_opt(common_arg( + {"--show-timings"}, {"--no-show-timings"}, - string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"), - [](common_params & params) { - params.show_timings = false; + string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"), + [](common_params & params, bool value) { + params.show_timings = value; } - ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS")); + ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", @@ -1171,16 +1235,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, - string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [](common_params & params) { - params.escape = true; - } - )); - add_opt(common_arg( {"--no-escape"}, - "do not process escape sequences", - [](common_params & params) { - params.escape = false; + string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [](common_params & params, bool value) { + params.escape = value; } )); add_opt(common_arg( @@ -1227,19 +1285,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, - "run in conversation mode:\n" + {"-no-cnv", "--no-conversation"}, + "whether to run in conversation mode:\n" "- does not print special tokens and suffix/prefix\n" "- interactive mode is also enabled\n" "(default: auto enabled if chat template is available)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; - } - ).set_examples({LLAMA_EXAMPLE_COMPLETION})); - add_opt(common_arg( - {"-no-cnv", "--no-conversation"}, - "force disable conversation mode (default: false)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + [](common_params & params, bool value) { + params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -1297,10 +1349,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( + {"--warmup"}, {"--no-warmup"}, - "skip warming up the model with an empty run", - [](common_params & params) { - params.warmup = false; + string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? 
"enabled" : "disabled"), + [](common_params & params, bool value) { + params.warmup = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( @@ -1702,19 +1755,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( + {"-kvo", "--kv-offload"}, {"-nkvo", "--no-kv-offload"}, - "disable KV offload", - [](common_params & params) { - params.no_kv_offload = true; + string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_kv_offload = !value; } - ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); + ).set_env("LLAMA_ARG_KV_OFFLOAD")); add_opt(common_arg( + {"--repack"}, {"-nr", "--no-repack"}, - "disable weight repacking", - [](common_params & params) { - params.no_extra_bufts = true; + string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_extra_bufts = !value; } - ).set_env("LLAMA_ARG_NO_REPACK")); + ).set_env("LLAMA_ARG_REPACK")); add_opt(common_arg( {"--no-host"}, "bypass host buffer allowing extra buffers to be used", @@ -1843,20 +1898,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, - string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](common_params & params) { - params.cont_batching = true; + {"-nocb", "--no-cont-batching"}, + string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.cont_batching = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(common_arg( - {"-nocb", "--no-cont-batching"}, - "disable continuous batching", - [](common_params & params) { - params.cont_batching = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(common_arg( - {"--mmproj"}, "FILE", + {"-mm", "--mmproj"}, "FILE", "path to a multimodal projector file. see tools/mtmd/README.md\n" "note: if -hf is used, this argument can be omitted", [](common_params & params, const std::string & value) { @@ -1864,26 +1913,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ")); add_opt(common_arg( - {"--mmproj-url"}, "URL", + {"-mmu", "--mmproj-url"}, "URL", "URL to a multimodal projector file. see tools/mtmd/README.md", [](common_params & params, const std::string & value) { params.mmproj.url = value; } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL")); add_opt(common_arg( - {"--no-mmproj"}, - "explicitly disable multimodal projector, useful when using -hf", - [](common_params & params) { - params.no_mmproj = true; + {"--mmproj-auto"}, + {"--no-mmproj", "--no-mmproj-auto"}, + string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? 
"disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_mmproj = !value; } - ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ")); + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO")); add_opt(common_arg( + {"--mmproj-offload"}, {"--no-mmproj-offload"}, - "do not offload multimodal projector to GPU", - [](common_params & params) { - params.mmproj_use_gpu = false; + string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.mmproj_use_gpu = value; } - ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD")); + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD")); add_opt(common_arg( {"--image", "--audio"}, "FILE", "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", @@ -1923,12 +1974,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( + {"--mmap"}, {"--no-mmap"}, - "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](common_params & params) { - params.use_mmap = false; + string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.use_mmap = value; } - ).set_env("LLAMA_ARG_NO_MMAP")); + ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" @@ -2116,10 +2168,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( + {"--op-offload"}, {"--no-op-offload"}, - string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"), - [](common_params & params) { - params.no_op_offload = true; + string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"), + [](common_params & params, bool value) { + params.no_op_offload = !value; } )); add_opt(common_arg( @@ -2315,10 +2368,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( + {"--ppl"}, {"--no-ppl"}, - string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](common_params & params) { - params.compute_ppl = false; + string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](common_params & params, bool value) { + params.compute_ppl = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( @@ -2437,12 +2491,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); add_opt(common_arg( + {"--webui"}, {"--no-webui"}, - string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), - [](common_params & params) { - params.webui = false; + string_format("whether to enable the Web UI (default: %s)", params.webui ? 
"enabled" : "disabled"), + [](common_params & params, bool value) { + params.webui = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), @@ -2547,18 +2602,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--slots"}, - string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_slots = true; + {"--no-slots"}, + string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.endpoint_slots = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); - add_opt(common_arg( - {"--no-slots"}, - "disables slots monitoring endpoint", - [](common_params & params) { - params.endpoint_slots = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", @@ -2609,26 +2658,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); add_opt(common_arg( + {"--models-autoload"}, {"--no-models-autoload"}, - "disables automatic loading of models (default: enabled)", - [](common_params & params) { - params.models_autoload = false; + string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.models_autoload = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD")); add_opt(common_arg( {"--jinja"}, - string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), - [](common_params & params) { - params.use_jinja = true; + {"--no-jinja"}, + string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.use_jinja = value; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); - add_opt(common_arg( - {"--no-jinja"}, - string_format("disable jinja template for chat (default: %s)", params.use_jinja ? 
"disabled" : "enabled"), - [](common_params & params) { - params.use_jinja = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" @@ -2673,15 +2717,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); add_opt(common_arg( + {"--prefill-assistant"}, {"--no-prefill-assistant"}, string_format( "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n" "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n" ), - [](common_params & params) { - params.prefill_assistant = false; + [](common_params & params, bool value) { + params.prefill_assistant = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), diff --git a/common/arg.h b/common/arg.h index 219c115e63..6db38da488 100644 --- a/common/arg.h +++ b/common/arg.h @@ -16,6 +16,7 @@ struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; std::set excludes = {}; std::vector args; + std::vector args_neg; // for negated args like --no-xxx const char * value_hint = nullptr; // help text or example for arg value const char * value_hint_2 = nullptr; // for second arg value const char * env = nullptr; @@ -25,6 +26,7 @@ struct common_arg { void (*handler_string) (common_params & params, const std::string &) = nullptr; void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_int) (common_params & params, int) = nullptr; + void (*handler_bool) (common_params & params, bool) = nullptr; common_arg() = default; @@ -48,6 +50,13 @@ struct common_arg { void (*handler)(common_params & params) ) : args(args), help(help), handler_void(handler) {} + common_arg( + const std::initializer_list & args, + const std::initializer_list & args_neg, + const std::string & help, + void (*handler)(common_params & params, bool) + ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {} + // support 2 values for arg common_arg( const std::initializer_list & args, @@ -80,6 +89,10 @@ struct common_arg { } return strcmp(args[0], other.args[0]) == 0; } + + // get all args and env vars (including negated args/env) + std::vector get_args() const; + std::vector get_env() const; }; namespace common_arg_utils { diff --git a/common/download.cpp b/common/download.cpp index ab68c53b43..ef87472560 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -472,36 +474,79 @@ std::pair> common_remote_get_content(const std::string & #elif defined(LLAMA_USE_HTTPLIB) -static bool is_output_a_tty() { +class ProgressBar { + static inline std::mutex mutex; + static inline std::map lines; + static inline 
int max_line = 0; + + static void cleanup(const ProgressBar * line) { + lines.erase(line); + if (lines.empty()) { + max_line = 0; + } + } + + static bool is_output_a_tty() { #if defined(_WIN32) - return _isatty(_fileno(stdout)); + return _isatty(_fileno(stdout)); #else - return isatty(1); + return isatty(1); #endif -} - -static void print_progress(size_t current, size_t total) { - if (!is_output_a_tty()) { - return; } - if (!total) { - return; +public: + ProgressBar() = default; + + ~ProgressBar() { + std::lock_guard lock(mutex); + cleanup(this); } - size_t width = 50; - size_t pct = (100 * current) / total; - size_t pos = (width * current) / total; + void update(size_t current, size_t total) { + if (!is_output_a_tty()) { + return; + } - std::cout << "[" - << std::string(pos, '=') - << (pos < width ? ">" : "") - << std::string(width - pos, ' ') - << "] " << std::setw(3) << pct << "% (" - << current / (1024 * 1024) << " MB / " - << total / (1024 * 1024) << " MB)\r"; - std::cout.flush(); -} + if (!total) { + return; + } + + std::lock_guard lock(mutex); + + if (lines.find(this) == lines.end()) { + lines[this] = max_line++; + std::cout << "\n"; + } + int lines_up = max_line - lines[this]; + + size_t width = 50; + size_t pct = (100 * current) / total; + size_t pos = (width * current) / total; + + std::cout << "\033[s"; + + if (lines_up > 0) { + std::cout << "\033[" << lines_up << "A"; + } + std::cout << "\033[2K\r[" + << std::string(pos, '=') + << (pos < width ? ">" : "") + << std::string(width - pos, ' ') + << "] " << std::setw(3) << pct << "% (" + << current / (1024 * 1024) << " MB / " + << total / (1024 * 1024) << " MB) " + << "\033[u"; + + std::cout.flush(); + + if (current == total) { + cleanup(this); + } + } + + ProgressBar(const ProgressBar &) = delete; + ProgressBar & operator=(const ProgressBar &) = delete; +}; static bool common_pull_file(httplib::Client & cli, const std::string & resolve_path, @@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli, const char * func = __func__; // avoid __func__ inside a lambda size_t downloaded = existing_size; size_t progress_step = 0; + ProgressBar bar; auto res = cli.Get(resolve_path, headers, [&](const httplib::Response &response) { @@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli, progress_step += len; if (progress_step >= total_size / 1000 || downloaded == total_size) { - print_progress(downloaded, total_size); + bar.update(downloaded, total_size); progress_step = 0; } return true; @@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli, nullptr ); - std::cout << "\n"; - if (!res) { LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1); return false; diff --git a/common/preset.cpp b/common/preset.cpp index 09ac171b72..729c27f2cf 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -23,8 +23,14 @@ std::vector common_preset::to_args() const { if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { // flag option, no value if (common_arg_utils::is_falsey(value)) { - // skip the flag - args.pop_back(); + // use negative arg if available + if (!opt.args_neg.empty()) { + args.back() = opt.args_neg.back(); + } else { + // otherwise, skip the flag + // TODO: maybe throw an error instead? 
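                // (editorial illustration, not part of the patch) with paired flags this
                // fallback only triggers for options that lack a negated form; e.g. a
                // hypothetical preset entry "context-shift = off" for the paired
                // {"--context-shift"}/{"--no-context-shift"} option is rewritten to
                // "--no-context-shift" by the branch above instead of being dropped.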
+ args.pop_back(); + } } } if (opt.value_hint != nullptr) { @@ -141,10 +147,10 @@ static std::map> parse_ini_from_ static std::map get_map_key_opt(common_params_context & ctx_params) { std::map mapping; for (const auto & opt : ctx_params.options) { - if (opt.env != nullptr) { - mapping[opt.env] = opt; + for (const auto & env : opt.get_env()) { + mapping[env] = opt; } - for (const auto & arg : opt.args) { + for (const auto & arg : opt.get_args()) { mapping[rm_leading_dashes(arg)] = opt; } } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 143fd82ee0..80262d141c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7370,6 +7370,10 @@ class DeepseekV2Model(TextModel): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) @@ -10131,6 +10135,10 @@ class MistralMoeModel(DeepseekV2Model): MistralModel.set_mistral_config(self.gguf_writer, self.hparams) yarn_params = self.hparams["yarn"] self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"]) + + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): diff --git a/docs/docker.md b/docs/docker.md index 98502a0c50..b9e5015396 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model or with a server image: ```bash -docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 ``` ## Docker With CUDA @@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne ```bash docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ## Docker With MUSA @@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne ```bash docker run -v /path/to/models:/models local/llama.cpp:full-musa 
--run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 420195f198..e9f7bf9313 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) { static void write_table_entry(std::ofstream & file, const common_arg & opt) { file << "| `"; // args - for (const auto & arg : opt.args) { - if (arg == opt.args.front()) { + auto all_args = opt.get_args(); + for (const auto & arg : all_args) { + if (arg == all_args.front()) { file << arg; - if (opt.args.size() > 1) file << ", "; + if (all_args.size() > 1) file << ", "; } else { - file << arg << (arg != opt.args.back() ? ", " : ""); + file << arg << (arg != all_args.back() ? ", " : ""); } } // value hint diff --git a/examples/model-conversion/scripts/causal/compare-logits.py b/examples/model-conversion/scripts/causal/compare-logits.py index afa0d5b263..894302c69e 100755 --- a/examples/model-conversion/scripts/causal/compare-logits.py +++ b/examples/model-conversion/scripts/causal/compare-logits.py @@ -1,10 +1,13 @@ #!/usr/bin/env python3 -import numpy as np import sys -import os +import numpy as np from pathlib import Path +# Add utils directory to path for direct script execution +sys.path.insert(0, str(Path(__file__).parent.parent / "utils")) +from common import get_model_name_from_env_path # type: ignore[import-not-found] + def quick_logits_check(pytorch_file, llamacpp_file): """Lightweight sanity check before NMSE""" @@ -32,27 +35,16 @@ def quick_logits_check(pytorch_file, llamacpp_file): print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}") print(f"Max absolute difference: {max_diff:.4f}") - if max_diff > 1.0: - print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}") - return False - return True def main(): - model_path = os.getenv('MODEL_PATH') - if not model_path: - print("Error: MODEL_PATH environment variable not set") - sys.exit(1) - - if not os.path.exists(model_path): - print(f"Error: Model file not found: {model_path}") - sys.exit(1) - - model_name = os.path.basename(model_path) + model_name = get_model_name_from_env_path('MODEL_PATH') data_dir = Path("data") - pytorch_file = data_dir / f"pytorch-{model_name}.bin" - llamacpp_file = data_dir / f"llamacpp-{model_name}.bin" + + llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL') + print(f"Using converted model: {llamacpp_model_name}") + llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin" if not pytorch_file.exists(): print(f"Error: PyTorch logits file not found: {pytorch_file}") diff --git a/examples/model-conversion/scripts/utils/__init__.py b/examples/model-conversion/scripts/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/model-conversion/scripts/utils/check-nmse.py b/examples/model-conversion/scripts/utils/check-nmse.py index 939e3153cc..83f63f9ff3 100755 --- 
a/examples/model-conversion/scripts/utils/check-nmse.py +++ b/examples/model-conversion/scripts/utils/check-nmse.py @@ -5,6 +5,7 @@ import sys import os import argparse from pathlib import Path +from common import get_model_name_from_env_path # type: ignore[import-not-found] def calculate_nmse(reference, test): mse = np.mean((test - reference) ** 2) @@ -67,11 +68,13 @@ def main(): parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory') args = parser.parse_args() - model_name = os.path.basename(args.model_path) + model_name = get_model_name_from_env_path('MODEL_PATH') data_dir = Path("data") pytorch_file = data_dir / f"pytorch-{model_name}.bin" - llamacpp_file = data_dir / f"llamacpp-{model_name}.bin" + + llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL') + llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin" print(f"Model name: {model_name}") print(f"PyTorch logits file: {pytorch_file}") diff --git a/examples/model-conversion/scripts/utils/common.py b/examples/model-conversion/scripts/utils/common.py new file mode 100644 index 0000000000..945f9a1a1d --- /dev/null +++ b/examples/model-conversion/scripts/utils/common.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import os +import sys + +def get_model_name_from_env_path(env_path_name): + model_path = os.getenv(env_path_name) + if not model_path: + print(f"Error: {env_path_name} environment variable not set") + sys.exit(1) + + if not os.path.exists(model_path): + print(f"Error: Model file not found: {model_path}") + sys.exit(1) + + name = os.path.basename(os.path.normpath(model_path)) + if name.endswith(".gguf"): + name = name[:-5] + + return name diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index a8e53f28eb..0d11d0f803 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -255,6 +255,8 @@ int main(int argc, char ** argv) { LOG_INF("target:\n\n"); common_perf_print(ctx_tgt, smpl); + llama_batch_free(batch_tgt); + common_sampler_free(smpl); common_speculative_free(spec); diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 9edd485136..4f3b99c8d0 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -99,6 +99,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_sme (void); // other GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); + GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index a5995fdc2c..ec16cbda9f 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al } // this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) { +static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) { size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", - __func__, tensor->name, addr.chunk, addr.offset, size, 
alloc->chunks[addr.chunk]->n_free_blocks); - -#ifdef GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, addr, tensor); -#endif - struct tallocr_chunk * chunk = alloc->chunks[addr.chunk]; // see if we can merge with an existing block @@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct } // otherwise, add a new block ggml_dyn_tallocr_insert_block(chunk, addr.offset, size); - - GGML_UNUSED(tensor); } static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { @@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten GGML_ASSERT(parent_size >= node_size); + // note: we want after the freeing the chunks to continue to be aligned + struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; + parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment); + node_size = aligned_offset(NULL, node_size, p_alloc->alignment); + if (parent_size > node_size) { - struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; struct buffer_address p_addr = p_hn->addr; p_addr.offset += node_size; size_t extra_size = parent_size - node_size; AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name); - ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent); + ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size); } } @@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; size_t size = ggml_backend_buft_get_alloc_size(buft, node); - ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node); + + AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", + __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks); +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, hn->addr, node); +#endif + + ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size); hn->allocated = false; } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 81288464c7..da624c587c 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten case GGML_OP_ARGSORT: case GGML_OP_ACC: case GGML_OP_GROUP_NORM: + return true; case GGML_OP_PAD: // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985 return ggml_get_op_params_i32(op, 8) == 0; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c47511adcb..a59b518938 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type { } ggml_arm_arch_features = { 0 }; #endif +#if defined(__riscv) +struct ggml_riscv_arch_features_type { + int rvv_vlen; +} ggml_riscv_arch_features = { 0 }; +#endif #if defined(_WIN32) @@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {} #endif #endif // __ARM_ARCH +#if defined(__riscv) && defined(__riscv_v_intrinsic) +#include +static void ggml_init_riscv_arch_features(void) { + ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb(); +} +#else +static void ggml_init_riscv_arch_features(void) {} +#endif + struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); @@ 
-3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) { #endif } +int ggml_cpu_get_rvv_vlen(void) { +#if defined(__riscv) && defined(__riscv_v_intrinsic) + return ggml_riscv_arch_features.rvv_vlen; +#else + return 0; +#endif +} + int ggml_cpu_has_f16c(void) { #if defined(__F16C__) return 1; @@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) { ggml_init_arm_arch_features(); #endif +#if defined(__riscv) + ggml_init_riscv_arch_features(); +#endif + is_first_call = false; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 3191faaa4c..f4713a4218 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_riscv_v()) { features.push_back({ "RISCV_V", "1" }); } + if (ggml_cpu_get_rvv_vlen() > 0) { + static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen()); + features.push_back({ "RVV_VLEN", rvv_vlen.c_str() }); + } if (ggml_cpu_has_vsx()) { features.push_back({ "VSX", "1" }); } diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 9f0d449bd6..b70ea7d78b 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons static const ggml::cpu::repack::tensor_traits iq4_nl_8x8_q8_0; if (cur->type == GGML_TYPE_Q4_0) { - if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) + || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) { if (cur->ne[1] % 8 == 0) { return &q4_0_8x8_q8_0; } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index c4529f5d94..9fcb2f9fd2 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -67,19 +67,22 @@ #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA +#define GGML_CUDA_CC_RDNA3_5 (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops. 
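// (editorial note, not part of the patch) classification example: a device such as
// gfx1151 (cc = GGML_CUDA_CC_OFFSET_AMD + 0x1151, the "AI Max" laptop APUs) falls in
// [RDNA3_5, RDNA4) and so matches GGML_CUDA_CC_IS_RDNA3_5 and GGML_CUDA_CC_IS_RDNA3
// below, while gfx1100 (cc = offset + 0x1100) only matches GGML_CUDA_CC_IS_RDNA3_0.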
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 -#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) -#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) -#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) -#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) -#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) -#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) -#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) -#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) -#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) -#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) -#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) +#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) +#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) +#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5) +#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4) +#define GGML_CUDA_CC_IS_RDNA3(cc) (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) +#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) +#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) +#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) +#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) +#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) // Moore Threads #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 2750117aa9..8dc82a9d3b 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup( const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; - const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const bool did_not_have_any_data = kbc0 == kbc0_stop; const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; @@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup( int bidx = bidx0 - 1; int kbc_stop = kbc0; while(true) { - const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; if (kbc == kbc_stop) { // Did not have any data. 
bidx--; kbc_stop = kbc; diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index d51537f7d0..7bd1044c19 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16( const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1; // kbc == k block continuous, current index in continuous ijk space. - int kbc = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined. // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup). @@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); const half * mask_h = ncols2 == 1 && !mask ? nullptr : - (const half *) (mask + nb33*(sequence % ne33)); + (const half *) (mask + nb33*(sequence % ne33)); float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8d17bc669a..ab0f6fe9ce 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4630,9 +4630,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CUMSUM: case GGML_OP_TRI: case GGML_OP_DIAG: - return true; case GGML_OP_SOLVE_TRI: - return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32; + return true; + default: return false; } diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 0b13293da9..dcfa40f4d5 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -189,6 +189,9 @@ namespace ggml_cuda_mma { return 8 * (threadIdx.x / 16) + l; #elif defined(RDNA3) return 2 * l + (threadIdx.x / 16); +#else + NO_DEVICE_CODE; + return -1; #endif // defined(RDNA4) } else { NO_DEVICE_CODE; @@ -290,8 +293,12 @@ namespace ggml_cuda_mma { } } #elif defined(AMD_WMMA_AVAILABLE) - +#if defined(RDNA3) + // RDNA3 has duplicated data as input. + static constexpr int ne = I * J / 32 * 2; +#else static constexpr int ne = I * J / 32; +#endif // defined(RDNA3) half2 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { @@ -310,7 +317,14 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { +#if defined(RDNA4) return 4 * (threadIdx.x / 16) + l; +#elif defined(RDNA3) + return l; +#else + NO_DEVICE_CODE; + return -1; +#endif // defined(RDNA4) } else { NO_DEVICE_CODE; return -1; @@ -366,11 +380,16 @@ namespace ggml_cuda_mma { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR; - static constexpr int ne = I * J / WARP_SIZE; - - nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; #if defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3) + // RDNA3 has duplicated data as input. 
+ static constexpr int ne = I * J / 32 * 2; +#else + static constexpr int ne = I * J / 32; +#endif // defined(RDNA3) + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + static constexpr __device__ bool supported() { if (I == 16 && J == 8) return true; return false; @@ -387,13 +406,23 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { +#if defined(RDNA4) return 4 * (threadIdx.x / 16) + l; +#elif defined(RDNA3) + return l; +#else + NO_DEVICE_CODE; + return -1; +#endif // defined(RDNA4) } else { NO_DEVICE_CODE; return -1; } } #else + static constexpr int ne = I * J / WARP_SIZE; + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + static constexpr __device__ bool supported() { if (I == 8 && J == 8) return true; if (I == 16 && J == 4) return true; @@ -546,8 +575,14 @@ namespace ggml_cuda_mma { } #elif defined(AMD_WMMA_AVAILABLE) if constexpr (std::is_same_v || std::is_same_v) { - ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); - +#if defined(RDNA4) + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); +#elif defined(RDNA3) + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); + ggml_cuda_memcpy_1(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2)); +#else + NO_DEVICE_CODE; +#endif // defined(RDNA4) } else if constexpr (std::is_same_v) { if constexpr (I == 16 && J == 4) { int64_t * xi = (int64_t *) t.x; @@ -888,6 +923,16 @@ namespace ggml_cuda_mma { const halfx8_t& a_frag = reinterpret_cast(A.x[0]); const halfx8_t& b_frag = reinterpret_cast(B.x[0]); acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag); +#elif defined(RDNA3) + using halfx16_t = __attribute__((ext_vector_type(16))) _Float16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const halfx16_t& a_frag = reinterpret_cast(A.x[0]); + const halfx16_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; #endif // RDNA4 #else GGML_UNUSED_VARS(D, A, B); @@ -905,6 +950,16 @@ namespace ggml_cuda_mma { const bf16x8_t& a_frag = reinterpret_cast(A.x[0]); const bf16x8_t& b_frag = reinterpret_cast(B.x[0]); acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag); +#elif defined(RDNA3) + using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const bf16x16_t& a_frag = reinterpret_cast(A.x[0]); + const bf16x16_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; #endif // RDNA4 #else GGML_UNUSED_VARS(D, A, B); diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 7cf33f0ddf..6643f243b1 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const return false; } } else { - if (src1_ncols > 16) { + if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) { + return false; + } else if (src1_ncols > 16) { return false; } } @@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const case GGML_TYPE_F32: return ampere_mma_available(cc); case GGML_TYPE_F16: - return volta_mma_available(cc) || turing_mma_available(cc) 
|| (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc)); + return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc); case GGML_TYPE_BF16: - return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc)); + return ampere_mma_available(cc) || amd_wmma_available(cc); default: return false; } diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 6238ce7ebd..32948e4d7a 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0 return ne11 <= 8; } else if (GGML_CUDA_CC_IS_AMD(cc)) { if (fp16_mma_hardware_available(cc)) { - if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc)) { + return ne11 <= 3; + } + if (GGML_CUDA_CC_IS_RDNA4(cc)) { return ne11 <= 5; } return ne11 <= 2; diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu index e161d4dc43..177ffc268f 100644 --- a/ggml/src/ggml-cuda/solve_tri.cu +++ b/ggml/src/ggml-cuda/solve_tri.cu @@ -3,6 +3,80 @@ #include "solve_tri.cuh" #define MAX_N_FAST 64 +#define MAX_K_FAST 32 + +static __global__ void get_batch_pointers(const float * A, + float * X, + const float ** A_ptrs, + float ** X_ptrs, + int64_t ne02, + int64_t total_batches, + size_t s02, + size_t s03, + size_t s2, + size_t s3) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_batches) { + return; + } + + const int64_t i3 = idx / ne02; + const int64_t i2 = idx % ne02; + + A_ptrs[idx] = A + i3 * s03 + i2 * s02; + X_ptrs[idx] = X + i3 * s3 + i2 * s2; +} + +static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx, + const float * A, + const float * B, + float * X, + int n, + int k, + int64_t ne02, + int64_t ne03, + size_t s02, + size_t s03, + size_t s12, + size_t s13, + size_t s2, + size_t s3, + cudaStream_t stream) { + const float alpha = 1.0f; + const int64_t total_batches = ne02 * ne03; + if (total_batches == 0) { + return; + } + + // Bulk copy B -> X (contiguous tensors) + if (X != B) { + const int64_t total_elements_BX = n * k * total_batches; + CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + } + + const int id = ggml_cuda_get_device(); + + ggml_cuda_pool_alloc A_ptrs_alloc(ctx.pool(id), total_batches); + ggml_cuda_pool_alloc X_ptrs_alloc(ctx.pool(id), total_batches); + + const float ** A_ptrs_dev = A_ptrs_alloc.get(); + float ** X_ptrs_dev = X_ptrs_alloc.get(); + + get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02, + total_batches, s02, s03, s2, s3); + + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); + + // Yes, this is necessary, without this we get RMSE errors + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH)); + CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches)); + + // revert to standard mode from common.cuh + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH)); + + GGML_UNUSED_VARS(s12, s13); +} // ====================== // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction @@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, float x_low = (lane < n) ? 
B_batch[lane * k + col_idx] : 0.0f; float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f; - const int half = WARP_SIZE; + const int half = WARP_SIZE; const int nrows_low = (n < half) ? n : half; #pragma unroll @@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, #pragma unroll for (int row = half; row < n; ++row) { - float sum = sA[row * n + lane] * x_low; - const int j = half + lane; + float sum = sA[row * n + lane] * x_low; + const int j = half + lane; if (j < row) { sum += sA[row * n + j] * x_high; } @@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, for (int rr = 0; rr < 2; ++rr) { const int row = rr * WARP_SIZE + lane; if (row < n) { - const float val = (row < half) ? x_low : x_high; + const float val = (row < half) ? x_low : x_high; X_batch[row * k + col_idx] = val; } } @@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A, } void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix) - const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns) + const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular) + const ggml_tensor * src1 = dst->src[1]; // B (n×k) ggml_is_contiguous(src0); ggml_is_contiguous(src1); - const int64_t n = src0->ne[0]; - const int64_t k = src1->ne[0]; + const int64_t n = src0->ne[0]; + const int64_t k = src1->ne[0]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; - GGML_ASSERT(n <= 64); - GGML_ASSERT(k <= 32); - - solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2], - src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), - src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), - dst->nb[3] / sizeof(float), ctx.stream()); + if (n <= MAX_N_FAST && k <= MAX_K_FAST) { + solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, + src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), + src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), + dst->nb[3] / sizeof(float), ctx.stream()); + } else { + solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, + ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), + src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), + dst->nb[3] / sizeof(float), ctx.stream()); + } } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index b7d6edf7fc..951a88d567 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -19,6 +19,9 @@ #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_16BF HIPBLAS_R_16B #define CUDA_R_32F HIPBLAS_R_32F +#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned @@ -30,6 +33,7 @@ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define __all_sync(mask, var) __all(var) #define 
__any_sync(mask, var) __any(var) +#define cublasStrsmBatched hipblasStrsmBatched #define cublasCreate hipblasCreate #define cublasDestroy hipblasDestroy #define cublasGemmEx hipblasGemmEx diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 8c55a2e4e5..221e67f96a 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -12,11 +12,16 @@ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT #define CUBLAS_OP_N MUBLAS_OP_N #define CUBLAS_OP_T MUBLAS_OP_T +#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH +#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT +#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER +#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH #define CUDA_R_16F MUSA_R_16F #define CUDA_R_16BF MUSA_R_16BF #define CUDA_R_32F MUSA_R_32F +#define cublasStrsmBatched mublasStrsmBatched #define cublasComputeType_t cudaDataType_t #define cublasCreate mublasCreate #define cublasDestroy mublasDestroy diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c6f5809ccd..34ec09d403 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -659,6 +659,7 @@ struct vk_device_struct { vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_log[2]; vk_pipeline pipeline_tri[2]; + vk_pipeline pipeline_diag[2]; vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_roll_f32; @@ -722,6 +723,11 @@ struct vk_device_struct { vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; vk_pipeline pipeline_soft_max_back_f32; + + vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16; + vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16; + vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16; + vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16; @@ -757,7 +763,8 @@ struct vk_device_struct { vk_pipeline pipeline_flash_attn_split_k_reduce; - vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT]; + // [2] is for whether to take n_experts from spec constant (0) or push constant (1) + vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2]; std::vector all_pipelines; @@ -1149,6 +1156,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256); struct vk_op_topk_moe_push_constants { uint32_t n_rows; + uint32_t n_experts_push; uint32_t n_expert_used; float clamp_min; float clamp_max; @@ -3730,6 +3738,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + 
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); @@ -3917,6 +3926,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); @@ -3996,6 +4008,13 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, 
soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); @@ -4204,10 +4223,12 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); - for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + for (uint32_t use_push = 0; use_push < 2; ++use_push) { + for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + } } for (auto &c : compiles) { @@ -8274,6 +8295,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const switch (op) { case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); + if (src0->type == GGML_TYPE_I32) { + // i32 src only supports i32 result + GGML_ASSERT(dst->type == GGML_TYPE_I32); + return ctx->device->pipeline_get_rows[src0->type]; + } if (dst->type == GGML_TYPE_F16) { return 
ctx->device->pipeline_get_rows[src0->type]; } @@ -8400,6 +8426,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16]; } return nullptr; + case GGML_OP_DIAG: + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16]; + } + return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_clamp_f32; @@ -8554,7 +8586,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); GGML_ASSERT(idx < num_topk_moe_pipelines); topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops); - return ctx->device->pipeline_topk_moe[idx][mode]; + // use n_experts from push constant if it's not equal to the power of two spec constant + bool use_push = dst->ne[0] != (1u << idx); + return ctx->device->pipeline_topk_moe[idx][mode][use_push]; } if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { @@ -9091,6 +9125,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_COS: case GGML_OP_LOG: case GGML_OP_TRI: + case GGML_OP_DIAG: case GGML_OP_CLAMP: case GGML_OP_PAD: case GGML_OP_ROLL: @@ -9778,6 +9813,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p)); } +static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p)); +} + static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); @@ -10111,7 +10152,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, { + vk_op_soft_max_push_constants pc { ncols, src1 != nullptr ? nrows_y : (uint32_t)0, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], @@ -10122,7 +10163,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, n_head_log2, nrows_x, src2 != nullptr - }); + }; + + if (ncols <= 16384) { + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc)); + } else { + + vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a; + vk_subbuffer buf_c = src2 ? 
ggml_vk_tensor_subbuffer(ctx, src2) : buf_a; + vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst); + + uint32_t elems_per_wg = 128 * 4; + uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg); + size_t tmp_size = num_wgs * nrows_x * sizeof(float); + + if (ctx->prealloc_size_x < tmp_size) { + ctx->prealloc_size_x = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_size_y < tmp_size) { + ctx->prealloc_size_y = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size }; + vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size }; + + std::array elements = { num_wgs, nrows_x, 1 }; + + vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32; + vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32; + vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32; + + ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + + ctx->prealloc_x_need_sync = true; + ctx->prealloc_y_need_sync = true; + } } static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -10158,6 +10247,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, vk_op_topk_moe_push_constants pc {}; pc.n_rows = n_rows; + pc.n_experts_push = n_experts; pc.n_expert_used = n_expert_used; if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) { ggml_tensor * clamp = cgraph->nodes[node_idx + 7]; @@ -11857,6 +11947,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TRI: ggml_vk_tri(ctx, compute_ctx, src0, node); + break; + case GGML_OP_DIAG: + ggml_vk_diag(ctx, compute_ctx, src0, node); + break; case GGML_OP_CLAMP: ggml_vk_clamp(ctx, compute_ctx, src0, node); @@ -12832,8 +12926,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc } const int n_expert = softmax->ne[0]; - // n_expert must be a power of 2 - if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) { + if (n_expert > (1 << (num_topk_moe_pipelines-1))) { return false; } @@ -13877,6 +13970,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: + case GGML_TYPE_I32: return true; default: return false; @@ -14001,6 +14095,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_LOG: case GGML_OP_TRI: + case GGML_OP_DIAG: return (op->src[0]->type == GGML_TYPE_F32 || 
op->src[0]->type == GGML_TYPE_F16) && op->type == op->src[0]->type; case GGML_OP_ARGSORT: @@ -14591,6 +14686,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * tensor_clone = ggml_log(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_TRI) { tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0)); + } else if (tensor->op == GGML_OP_DIAG) { + tensor_clone = ggml_diag(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_CLAMP) { const float * params = (const float *)tensor->op_params; tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp new file mode 100644 index 0000000000..cd3f42f491 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp @@ -0,0 +1,29 @@ +#version 450 + +#include "rte.glsl" +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12_offset = i12*p.ne11*p.ne10; + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; + + if (i10 == i11) { + const float val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + 0*p.nb01 + i10*p.nb00]); + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val); + } else { + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp index 76d83041ce..e88bdd057e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp @@ -26,9 +26,9 @@ void main() { const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23; #if defined(DATA_A_BF16) - FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00])); + TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00])); #else - FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]); + TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]); #endif #ifndef OPTIMIZATION_ERROR_WORKAROUND data_d[d_offset + i00] = D_TYPE(v); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp new file mode 100644 index 0000000000..39c4663912 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp @@ -0,0 +1,62 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + float slope = get_slope(rowx); + + // Find max + FLOAT_TYPE max_val = p.has_sinks == 0 ? 
uintBitsToFloat(0xFF800000) : data_c[i02]; + + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + FLOAT_TYPE a = FLOAT_TYPE(0); + if (col < p.KX) { + a = data_a[rowx * p.KX + col]; + } + + FLOAT_TYPE b = FLOAT_TYPE(0); + if (p.KY > 0 && col < p.KX) { + b = data_b[rowy_start + col]; + } + + FLOAT_TYPE v = a * p.scale + slope * b; + + if (col < p.KX) { + max_val = max(max_val, v); + } + } + + // reduce across the workgroup + vals[tid] = max_val; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(vals[tid], vals[tid + s]); + } + barrier(); + } + + if (tid == 0) { + max_val = vals[0]; + data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = max_val; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp new file mode 100644 index 0000000000..69524f5f75 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp @@ -0,0 +1,79 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + float slope = get_slope(rowx); + + // Find max + FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; + + [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) { + if (i + tid < gl_NumWorkGroups.x) { + max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]); + } + } + + // reduce across the workgroup + vals[tid] = max_val; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(max_val, vals[tid + s]); + } + barrier(); + } + + max_val = vals[0]; + barrier(); + + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); + + // Compute sum{exp(x - max)} + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + if (col >= p.KX) { + break; + } + + // compute exp(a*scale+b*slope), add it to sum + const uint i = rowx * p.KX + col; + FLOAT_TYPE val; + val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? 
slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val); + sum += val; + data_d[i] = D_TYPE(val); + } + + // reduce across the workgroup + vals[tid] = sum; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] += vals[tid + s]; + } + barrier(); + } + + if (tid == 0) { + sum = vals[0]; + data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp new file mode 100644 index 0000000000..06efd7d9fb --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp @@ -0,0 +1,65 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +shared FLOAT_TYPE sumsh[BLOCK_SIZE]; + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); + + [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) { + if (i + tid < gl_NumWorkGroups.x) { + max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]); + sum += data_s[rowx * gl_NumWorkGroups.x + i + tid]; + } + } + + // reduce across the workgroup + vals[tid] = max_val; + sumsh[tid] = sum; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(max_val, vals[tid + s]); + sumsh[tid] += sumsh[tid + s]; + } + barrier(); + } + + max_val = vals[0]; + sum = sumsh[0]; + + if (p.has_sinks != 0) { + sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val)); + } + + FLOAT_TYPE rcpdivisor = 1.0/sum; + + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + if (col >= p.KX) { + continue; + } + + data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl new file mode 100644 index 0000000000..6636d1f8de --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl @@ -0,0 +1,53 @@ +#extension GL_EXT_control_flow_attributes : enable + +layout (push_constant) uniform parameter +{ + uint KX; + uint KY; + uint ne00; + uint ne01; + uint ne02; + uint ne12; + uint ne13; + uint nb11; + uint nb12; + uint nb13; + float scale; + float max_bias; + float m0; + float m1; + uint n_head_log2; + uint nrows_x; + uint has_sinks; +} p; + +#include "types.glsl" + +layout(constant_id = 0) const uint BLOCK_SIZE = 128; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +layout(constant_id = 1) const uint num_iters = 4; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer Y {B_TYPE data_b[];}; +layout (binding = 2) readonly buffer Z {float data_c[];}; +layout (binding = 3) buffer D {D_TYPE data_d[];}; +layout (binding = 4) buffer M {float data_m[];}; +layout (binding = 5) buffer S {float data_s[];}; + +shared FLOAT_TYPE vals[BLOCK_SIZE]; + +float get_slope(uint rowx) { + 
float slope = 1.0f; + + // ALiBi + if (p.max_bias > 0.0f) { + const uint h = (rowx / p.ne01) % p.ne02; // head index + + const float base = h < p.n_head_log2 ? p.m0 : p.m1; + const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1; + + slope = pow(base, exp); + } + + return slope; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp index 5cd0785d20..b83a2b9d2d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp @@ -10,6 +10,7 @@ layout (push_constant) uniform parameter { uint n_rows; + uint n_experts_push; uint n_expert_used; float clamp_min; float clamp_max; @@ -18,11 +19,16 @@ layout (push_constant) uniform parameter layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; layout(constant_id = 0) const uint WARP_SIZE = 32; -layout(constant_id = 1) const uint n_experts = 512; +layout(constant_id = 1) const uint n_experts_spec = 512; layout(constant_id = 2) const bool with_norm = true; layout(constant_id = 3) const bool late_softmax = false; +layout(constant_id = 4) const bool nexperts_use_push = false; -const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1; +uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec; + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + +const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE); layout (binding = 0, std430) readonly buffer Logits {float logits[];}; layout (binding = 1, std430) writeonly buffer Weights {float weights[];}; @@ -94,7 +100,7 @@ void main() { } if (!late_softmax) { - softmax_warp_inplace(wt, n_experts, lane, false); + softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push); } // at this point, each thread holds a portion of softmax, diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 92bae088b2..b0ade078c7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -704,13 +704,15 @@ void process_shaders() { shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? 
"get_rows.comp" : "get_rows_quant.comp"; if (tname == "f16") { - string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); } else { - string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); } - string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); + string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); } + string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}}); + string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}); string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}); string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}); @@ -854,6 +856,8 @@ void process_shaders() { string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); @@ -899,6 +903,13 @@ void process_shaders() { string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large3_f32_f16", 
"soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); diff --git a/pyrightconfig.json b/pyrightconfig.json index 5320fe5864..a7bc007bdc 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,5 +1,5 @@ { - "extraPaths": ["gguf-py"], + "extraPaths": ["gguf-py", "examples/model-conversion/scripts"], "pythonVersion": "3.9", "pythonPlatform": "All", "reportUnusedImport": "warning", diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 86a1a4ba18..386fab04ac 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u udata->seq_idx .resize(LLAMA_MAX_SEQ, -1); udata->output .resize(n_tokens); + udata->seq_id_data.reserve(n_tokens); + seq_set_t seq_set_unq; for (size_t i = 0; i < idxs.size(); ++i) { @@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u } udata->n_seq_id[i] = batch.n_seq_id[idxs[i]]; - udata->seq_id[i] = batch.seq_id[idxs[i]]; udata->output[i] = batch.logits[idxs[i]]; for (int s = 0; s < udata->n_seq_id[i]; ++s) { - seq_set_unq.set(udata->seq_id[i][s]); + const llama_seq_id seq_id = batch.seq_id[idxs[i]][s]; + + udata->seq_id_data.push_back(seq_id); + seq_set_unq.set(seq_id); } if (udata->output[i]) { @@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u } } + llama_seq_id * seq_id_ptr = udata->seq_id_data.data(); + for (size_t i = 0; i < idxs.size(); ++i) { + udata->seq_id[i] = seq_id_ptr; + seq_id_ptr += udata->n_seq_id[i]; + } + for (uint32_t s = 0; s < n_seq_max; ++s) { if (seq_set_unq.test(s)) { udata->seq_idx[s] = udata->seq_id_unq.size(); diff --git a/src/llama-batch.h b/src/llama-batch.h index 209cf3699d..8e6fac0efa 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -56,13 +56,15 @@ struct llama_ubatch { std::vector embd; std::vector pos; std::vector n_seq_id; - std::vector seq_id; + std::vector seq_id; // these point into the seq_id_data below std::vector seq_id_unq; std::vector seq_idx; std::vector output; + + std::vector seq_id_data; }; - // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data + // the llama_ubatch pointers above point to this data if set. 
otherwise - point to external non-owning data std::shared_ptr<data_t> data; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 47484d9d97..97fec908b5 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), ext_factor (cparams.yarn_ext_factor), - attn_factor (cparams.yarn_attn_factor), + attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)), beta_fast (cparams.yarn_beta_fast), beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 8cdbaf69fc..277d0bcfd3 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -1,7 +1,9 @@ #include "llama-hparams.h" #include "ggml.h" + #include +#include <cmath> void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { @@ -229,3 +231,13 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama return false; } + +float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) { + GGML_ASSERT(ext_factor >= 0.0f); + + if (ext_factor != 0.0f) { + attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + } + + return attn_factor; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6eff334a5f..c9960e9169 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -107,6 +107,7 @@ struct llama_hparams { float rope_freq_base_train_swa; float rope_freq_scale_train; float rope_freq_scale_train_swa; + uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; @@ -267,7 +268,13 @@ struct llama_hparams { // TODO: think of a better place for this function // TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1); + + // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor: + // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544 + // + // ref: https://github.com/ggml-org/llama.cpp/discussions/7416 + // https://github.com/ggml-org/llama.cpp/pull/17945 + static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor); }; static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); - diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 58f4a198bb..51005ab31a 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift( float freq_scale) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor); const auto & n_rot = hparams.n_rot; const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE @@ -1384,9 +1385,11 @@ ggml_tensor * llama_kv_cache::build_rope_shift( // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + /* const float yarn_attn_factor = (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_DEEPSEEK2OCR) ?
1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; + */ ggml_tensor * tmp; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d94354da20..ffaf54f038 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1639,7 +1639,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { // that have no expert_gating_func model parameter set hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); + + if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) { + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // cancel the factor from the convert script + hparams.rope_yarn_log_mul /= 0.1f; + } // (optional) temperature tuning - used by mistral-large ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); @@ -2272,9 +2277,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f); // TODO: maybe add n_attn_temp_floor_scale as a separate KV? if (hparams.f_attn_temp_scale != 0.0f) { @@ -2284,18 +2289,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } - // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f - // but may need further verification with other values - if (hparams.rope_yarn_log_mul != 0.0f) { - float factor = 1.0f / hparams.rope_freq_scale_train; - float mscale = 1.0f; - float mscale_all_dims = hparams.rope_yarn_log_mul; - static auto get_mscale = [](float scale, float mscale) { - return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f); - }; - hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); - } - switch (hparams.n_layer) { case 26: type = LLM_TYPE_3B; break; case 34: type = LLM_TYPE_8B; break; @@ -2306,6 +2299,32 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: throw std::runtime_error("unsupported model architecture"); } + // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348 + if (hparams.rope_yarn_log_mul != 0.0f) { + const float factor = 1.0f / hparams.rope_freq_scale_train; + + // note: here we assume `mscale == 1.0f` + // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f + float mscale = 1.0f; + const float mscale_all_dims = hparams.rope_yarn_log_mul; + + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // special-case DEEPSEEK v2: + // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43 + if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) { + mscale = mscale_all_dims; + } + + static auto get_mscale = [](float scale, float mscale) { + return scale <= 1.0f ? 
1.0f : (0.1f * mscale * logf(scale) + 1.0f); + }; + + hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); + + LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n", + __func__, hparams.yarn_attn_factor, mscale, mscale_all_dims); + } + pimpl->n_bytes = ml.n_bytes; pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name(); @@ -6842,6 +6861,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); + LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { @@ -6905,7 +6925,6 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); } if (arch == LLM_ARCH_QWEN2MOE) { diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 1599ea0d96..2ac31610ed 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B @@ -21,9 +19,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. 
- const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); - const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k)); - const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + + // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor + GGML_ASSERT(ext_factor >= 0.0f); + const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale)); + + // use the original attn_factor to pre-scale the kq_scale + const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); + const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k)); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index a60ca12fe5..90750b20c2 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -20,20 +20,20 @@ int main(void) { std::unordered_set seen_env_vars; for (const auto & opt : ctx_arg.options) { // check for args duplications - for (const auto & arg : opt.args) { + for (const auto & arg : opt.get_args()) { if (seen_args.find(arg) == seen_args.end()) { seen_args.insert(arg); } else { - fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg); + fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str()); exit(1); } } // check for env var duplications - if (opt.env) { - if (seen_env_vars.find(opt.env) == seen_env_vars.end()) { - seen_env_vars.insert(opt.env); + for (const auto & env : opt.get_env()) { + if (seen_env_vars.find(env) == seen_env_vars.end()) { + seen_env_vars.insert(env); } else { - fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env); + fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str()); exit(1); } } @@ -115,6 +115,14 @@ int main(void) { assert(params.model.path == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); + printf("test-arg-parser: test negated environment variables\n\n"); + + setenv("LLAMA_ARG_MMAP", "0", true); + setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.use_mmap == false); + assert(params.no_perf == true); printf("test-arg-parser: test environment variables being overwritten\n\n"); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7be1f66038..416218b5b8 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7652,6 +7652,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f)); + for (float max_bias : {0.0f, 8.0f}) { for (float scale : {1.0f, 0.1f}) { for (int64_t ne0 : {16, 1024}) { @@ -7861,9 +7864,24 @@ static std::vector> make_test_cases_eval() { 
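(Editor's illustration, not part of the patch: the negated-environment-variable behaviour exercised by the tests/test-arg-parser.cpp hunk above. A boolean option's env var can now be set to "0" to switch the option off, while the legacy "set it to 1" form keeps working. The sketch below only mirrors what the added assertions expect; the parse call is left as a placeholder comment rather than a real invocation.)

#include <cstdlib>

// what the new test asserts (paraphrased): boolean args can be negated via their
// env var value, e.g. LLAMA_ARG_MMAP=0 turns mmap off, while the older
// "presence means true" style (LLAMA_ARG_NO_PERF=1) keeps working.
int main() {
    setenv("LLAMA_ARG_MMAP",    "0", 1); // expect params.use_mmap == false after parsing
    setenv("LLAMA_ARG_NO_PERF", "1", 1); // legacy format, expect params.no_perf == true
    // ... call common_params_parse() as in the test above, then check the two fields ...
    return 0;
}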
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 })); for (bool v : {false, true}) { for (bool circular : {false, true}) { @@ -7956,8 +7974,12 @@ static std::vector> make_test_cases_eval() { for (bool with_norm : {false, true}) { test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm)); + test_cases.emplace_back(new test_topk_moe({31, 22, 1, 1}, 8, with_norm)); test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm)); + test_cases.emplace_back(new test_topk_moe({40, 22, 1, 1}, 8, with_norm)); + test_cases.emplace_back(new test_topk_moe({71, 22, 1, 1}, 8, with_norm)); test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm)); + test_cases.emplace_back(new test_topk_moe({129, 1, 1, 1}, 128, with_norm)); } test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true)); @@ -8064,12 +8086,13 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416)); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 })); + test_cases.emplace_back(new 
test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); // qwen3next with CHUNK_SIZE 64 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 })); // qwen3next with CHUNK_SIZE 128 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 })); diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 13ab7c78f4..6b80fe45f7 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -6,11 +6,26 @@ add_library(mtmd mtmd.cpp mtmd-audio.cpp mtmd.h + mtmd-helper.cpp + mtmd-helper.h clip.cpp clip.h clip-impl.h - mtmd-helper.cpp - mtmd-helper.h + clip-model.h + clip-graph.h + models/models.h + models/cogvlm.cpp + models/internvl.cpp + models/kimivl.cpp + models/llama4.cpp + models/llava.cpp + models/minicpmv.cpp + models/pixtral.cpp + models/qwen2vl.cpp + models/qwen3vl.cpp + models/siglip.cpp + models/whisper-enc.cpp + models/deepseekocr.cpp ) set_target_properties(mtmd PROPERTIES @@ -53,6 +68,15 @@ if (TARGET BUILD_INFO) add_dependencies(mtmd-helper BUILD_INFO) endif() +# if mtmd is linked against common, we throw an error +if (TARGET mtmd) + get_target_property(libs mtmd LINK_LIBRARIES) + if (libs AND "common" IN_LIST libs) + message(FATAL_ERROR "mtmd is designed to be a public library.\n" + "It must not link against common") + endif() +endif() + add_executable(llama-llava-cli deprecation-warning.cpp) add_executable(llama-gemma3-cli deprecation-warning.cpp) add_executable(llama-minicpmv-cli deprecation-warning.cpp) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h new file mode 100644 index 0000000000..6d303b4e48 --- /dev/null +++ b/tools/mtmd/clip-graph.h @@ -0,0 +1,115 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpp.h" +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" + +#include +#include + +struct clip_graph { + const clip_model & model; + const clip_hparams & hparams; + projector_type proj_type; + + // we only support single image per batch + const clip_image_f32 & img; + + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const int n_mmproj_embd; + const float eps; + const float kq_scale; + const clip_flash_attn_type flash_attn_type; + + // for debugging + const bool debug_graph; + std::vector & debug_print_tensors; + + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; + + clip_graph(clip_ctx * ctx, const clip_image_f32 & img); + + virtual ~clip_graph() = default; + virtual ggml_cgraph * build() = 0; + + // + // utility functions + // + void cb(ggml_tensor * cur0, const char * name, int il) const; + + // siglip2 naflex + ggml_tensor * resize_position_embeddings(); + + // build vision transformer (ViT) cgraph + // this function should cover most of the models + // if your model has specific features, you should probably duplicate this function + ggml_tensor * build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + 
std::function add_pos); + + // build the input after conv2d (inp_raw --> patches) + // returns tensor with shape [n_embd, n_patches] + ggml_tensor * build_inp(); + + ggml_tensor * build_inp_raw(int channels = 3); + + ggml_tensor * build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const; + + ggml_tensor * build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const; + + ggml_tensor * build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const; + + // implementation of the 2D RoPE without adding a new op in ggml + // this is not efficient (use double the memory), but works on all backends + // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 + ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq + ); + + // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) + // support dynamic resolution + ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor); +}; diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index f8c2a105df..4859376922 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -1,3 +1,5 @@ +#pragma once + #include "ggml.h" #include "gguf.h" #include "clip.h" @@ -14,6 +16,8 @@ // Internal header for clip.cpp +#define MTMD_INTERNAL_HEADER + #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" @@ -150,6 +154,10 @@ // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) +// forward declaration +// TODO: improve this later +struct clip_ctx; + enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h new file mode 100644 index 0000000000..46577a2a4a --- /dev/null +++ b/tools/mtmd/clip-model.h @@ -0,0 +1,324 @@ +#pragma once + +#include "ggml.h" +#include "clip.h" +#include "clip-impl.h" + +#include +#include +#include +#include + +enum ffn_op_type { + FFN_GELU, + FFN_GELU_ERF, + FFN_SILU, + FFN_GELU_QUICK, +}; + +enum norm_type { + NORM_TYPE_NORMAL, + NORM_TYPE_RMS, +}; + +enum patch_merge_type { + PATCH_MERGE_FLAT, + PATCH_MERGE_SPATIAL_UNPAD, +}; + +struct clip_hparams { + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; + // idefics3 + int32_t image_longest_edge = 0; + int32_t image_min_pixels = -1; + int32_t image_max_pixels = -1; + int32_t n_merge = 0; // number of patch merges **per-side** + + float image_mean[3]; + float image_std[3]; + + // for models using dynamic image size, we need to have a smaller image size to warmup + // otherwise, user will get OOM everytime they load the model + int32_t warmup_image_size = 0; + int32_t warmup_audio_size = 3000; + + ffn_op_type ffn_op = FFN_GELU; + + patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; + + 
float eps = 1e-6; + float rope_theta = 0.0; + + std::vector image_res_candidates; // for llava-uhd style models + int32_t image_crop_resolution; + std::unordered_set vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; + + // deepseek-ocr (sam) + int32_t sam_n_layer = 0; + int32_t sam_n_head = 0; + int32_t sam_n_embd = 0; + + // audio + int32_t n_mel_bins = 0; // whisper preprocessor + int32_t proj_stack_factor = 0; // ultravox + + // legacy + bool has_llava_projector = false; + int minicpmv_version = 0; + int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // custom value provided by user, can be undefined if not set + int32_t custom_image_min_tokens = -1; + int32_t custom_image_max_tokens = -1; + + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; + image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + // TODO: support warmup size for custom token numbers + } + // sam vit deepseek-ocr + std::vector global_attn_indices() const { + return { 2, 5, 8, 11 }; + } + bool is_global_attn(int32_t layer) const { + const auto indices = global_attn_indices(); + + for (const auto & idx : indices) { + if (layer == idx) { + return true; + } + } + + return false; + } +}; + +struct clip_layer { + // attention + ggml_tensor * k_w = nullptr; + ggml_tensor * k_b = nullptr; + ggml_tensor * q_w = nullptr; + ggml_tensor * q_b = nullptr; + ggml_tensor * v_w = nullptr; + ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; + + ggml_tensor * o_w = nullptr; + ggml_tensor * o_b = nullptr; + + ggml_tensor * k_norm = nullptr; + ggml_tensor * q_norm = nullptr; + + // layernorm 1 + ggml_tensor * ln_1_w = nullptr; + ggml_tensor * ln_1_b = nullptr; + + ggml_tensor * ff_up_w = nullptr; + ggml_tensor * ff_up_b = nullptr; + ggml_tensor * ff_gate_w = nullptr; + ggml_tensor * ff_gate_b = nullptr; + ggml_tensor * ff_down_w = nullptr; + ggml_tensor * ff_down_b = nullptr; + + // layernorm 2 + ggml_tensor * ln_2_w = nullptr; + ggml_tensor * ln_2_b = nullptr; + + // layer scale (no bias) + ggml_tensor * ls_1_w = nullptr; + ggml_tensor * ls_2_w = nullptr; + + // qwen3vl deepstack merger + ggml_tensor * deepstack_norm_w = nullptr; + ggml_tensor * deepstack_norm_b = nullptr; + ggml_tensor * deepstack_fc1_w = nullptr; + ggml_tensor * deepstack_fc1_b = nullptr; + ggml_tensor * deepstack_fc2_w = nullptr; + ggml_tensor * deepstack_fc2_b = nullptr; + + // sam rel_pos + ggml_tensor * rel_pos_w = nullptr; + ggml_tensor * rel_pos_h = nullptr; + + bool has_deepstack() const { + return deepstack_fc1_w != nullptr; + } +}; + +struct clip_model { + clip_modality modality = CLIP_MODALITY_VISION; + projector_type proj_type = PROJECTOR_TYPE_MLP; + clip_hparams hparams; + + // embeddings + ggml_tensor * class_embedding = nullptr; + ggml_tensor * patch_embeddings_0 = nullptr; + ggml_tensor * patch_embeddings_1 = 
nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_bias = nullptr; + ggml_tensor * position_embeddings = nullptr; + + ggml_tensor * pre_ln_w = nullptr; + ggml_tensor * pre_ln_b = nullptr; + + std::vector layers; + + int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer + + ggml_tensor * post_ln_w; + ggml_tensor * post_ln_b; + + ggml_tensor * fc_w; + ggml_tensor * fc_b; + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; + + // LLaVA projection + ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_input_norm_b = nullptr; + ggml_tensor * mm_0_w = nullptr; + ggml_tensor * mm_0_b = nullptr; + ggml_tensor * mm_2_w = nullptr; + ggml_tensor * mm_2_b = nullptr; + + ggml_tensor * image_newline = nullptr; + ggml_tensor * view_seperator = nullptr; + + + // Yi type models with mlp+normalization projection + ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 + ggml_tensor * mm_1_b = nullptr; + ggml_tensor * mm_3_w = nullptr; + ggml_tensor * mm_3_b = nullptr; + ggml_tensor * mm_4_w = nullptr; + ggml_tensor * mm_4_b = nullptr; + + // GLMV-Edge projection + ggml_tensor * mm_model_adapter_conv_w = nullptr; + ggml_tensor * mm_model_adapter_conv_b = nullptr; + + // MobileVLM projection + ggml_tensor * mm_model_mlp_1_w = nullptr; + ggml_tensor * mm_model_mlp_1_b = nullptr; + ggml_tensor * mm_model_mlp_3_w = nullptr; + ggml_tensor * mm_model_mlp_3_b = nullptr; + ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; + + // MobileVLM_V2 projection + ggml_tensor * mm_model_mlp_0_w = nullptr; + ggml_tensor * mm_model_mlp_0_b = nullptr; + ggml_tensor * mm_model_mlp_2_w = nullptr; + ggml_tensor * mm_model_mlp_2_b = nullptr; + ggml_tensor * mm_model_peg_0_w = nullptr; + ggml_tensor * mm_model_peg_0_b = nullptr; + + // MINICPMV projection + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * mm_model_ln_q_b = nullptr; + 
ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; + + // gemma3 + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // pixtral + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; + + // ultravox / whisper encoder + ggml_tensor * conv1d_1_w = nullptr; + ggml_tensor * conv1d_1_b = nullptr; + ggml_tensor * conv1d_2_w = nullptr; + ggml_tensor * conv1d_2_b = nullptr; + ggml_tensor * mm_norm_pre_w = nullptr; + ggml_tensor * mm_norm_mid_w = nullptr; + + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + + // deepseek ocr sam + ggml_tensor * patch_embed_proj_w = nullptr; + ggml_tensor * patch_embed_proj_b = nullptr; + ggml_tensor * pos_embed = nullptr; + + ggml_tensor * neck_0_w; + ggml_tensor * neck_1_w; + ggml_tensor * neck_1_b; + ggml_tensor * neck_2_w; + ggml_tensor * neck_3_w; + ggml_tensor * neck_3_b; + ggml_tensor * net_2; + ggml_tensor * net_3; + + int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder + + std::vector sam_layers; + + bool audio_has_avgpool() const { + return proj_type == PROJECTOR_TYPE_QWEN2A + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } + + bool audio_has_stack_frames() const { + return proj_type == PROJECTOR_TYPE_ULTRAVOX + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } +}; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 10324e165a..507c2d3407 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1,9 +1,9 @@ -// NOTE: This is modified from clip.cpp only for LLaVA, -// so there might be still unnecessary artifacts hanging around -// I'll gradually clean and extend it -// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" #include "clip-impl.h" +#include "clip-model.h" +#include "clip-graph.h" +#include "models/models.h" + #include "ggml.h" #include "ggml-cpp.h" #include "ggml-alloc.h" @@ -27,18 +27,6 @@ struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; -enum ffn_op_type { - FFN_GELU, - FFN_GELU_ERF, - FFN_SILU, - FFN_GELU_QUICK, -}; - -enum norm_type { - NORM_TYPE_NORMAL, - NORM_TYPE_RMS, -}; - //#define CLIP_DEBUG_FUNCTIONS #ifdef CLIP_DEBUG_FUNCTIONS @@ -150,313 +138,6 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u #endif -// -// clip layers -// - -enum patch_merge_type { - PATCH_MERGE_FLAT, - PATCH_MERGE_SPATIAL_UNPAD, -}; - -struct clip_hparams { - int32_t image_size = 0; - int32_t patch_size = 0; - int32_t n_embd = 0; - int32_t n_ff = 0; - int32_t projection_dim = 0; - int32_t n_head = 0; - int32_t n_layer = 0; - // idefics3 - int32_t image_longest_edge = 0; - int32_t image_min_pixels = -1; - int32_t image_max_pixels = -1; - int32_t n_merge = 0; // number of patch merges **per-side** - - float image_mean[3]; - float image_std[3]; - - // for models using dynamic image size, we need to have a smaller image size to warmup - // otherwise, user will get OOM everytime they load the model - int32_t warmup_image_size = 0; - int32_t warmup_audio_size = 3000; - - ffn_op_type ffn_op = FFN_GELU; - - patch_merge_type 
mm_patch_merge_type = PATCH_MERGE_FLAT; - - float eps = 1e-6; - float rope_theta = 0.0; - - std::vector image_res_candidates; // for llava-uhd style models - int32_t image_crop_resolution; - std::unordered_set vision_feature_layer; - int32_t attn_window_size = 0; - int32_t n_wa_pattern = 0; - - // deepseek-ocr (sam) - int32_t sam_n_layer = 0; - int32_t sam_n_head = 0; - int32_t sam_n_embd = 0; - - // audio - int32_t n_mel_bins = 0; // whisper preprocessor - int32_t proj_stack_factor = 0; // ultravox - - // legacy - bool has_llava_projector = false; - int minicpmv_version = 0; - int32_t minicpmv_query_num = 0; // MiniCPM-V query number - - // custom value provided by user, can be undefined if not set - int32_t custom_image_min_tokens = -1; - int32_t custom_image_max_tokens = -1; - - void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { - const int cur_merge = n_merge == 0 ? 1 : n_merge; - const int patch_area = patch_size * patch_size * cur_merge * cur_merge; - image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; - image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; - warmup_image_size = static_cast(std::sqrt(image_max_pixels)); - } - - void set_warmup_n_tokens(int n_tokens) { - int n_tok_per_side = static_cast(std::sqrt(n_tokens)); - GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); - const int cur_merge = n_merge == 0 ? 1 : n_merge; - warmup_image_size = n_tok_per_side * patch_size * cur_merge; - // TODO: support warmup size for custom token numbers - } - - // sam vit deepseek-ocr - std::vector global_attn_indices() const { - return { 2, 5, 8, 11 }; - } - - bool is_global_attn(int32_t layer) const { - const auto indices = global_attn_indices(); - - for (const auto & idx : indices) { - if (layer == idx) { - return true; - } - } - - return false; - } -}; - -struct clip_layer { - // attention - ggml_tensor * k_w = nullptr; - ggml_tensor * k_b = nullptr; - ggml_tensor * q_w = nullptr; - ggml_tensor * q_b = nullptr; - ggml_tensor * v_w = nullptr; - ggml_tensor * v_b = nullptr; - ggml_tensor * qkv_w = nullptr; - ggml_tensor * qkv_b = nullptr; - - ggml_tensor * o_w = nullptr; - ggml_tensor * o_b = nullptr; - - ggml_tensor * k_norm = nullptr; - ggml_tensor * q_norm = nullptr; - - // layernorm 1 - ggml_tensor * ln_1_w = nullptr; - ggml_tensor * ln_1_b = nullptr; - - ggml_tensor * ff_up_w = nullptr; - ggml_tensor * ff_up_b = nullptr; - ggml_tensor * ff_gate_w = nullptr; - ggml_tensor * ff_gate_b = nullptr; - ggml_tensor * ff_down_w = nullptr; - ggml_tensor * ff_down_b = nullptr; - - // layernorm 2 - ggml_tensor * ln_2_w = nullptr; - ggml_tensor * ln_2_b = nullptr; - - // layer scale (no bias) - ggml_tensor * ls_1_w = nullptr; - ggml_tensor * ls_2_w = nullptr; - - // qwen3vl deepstack merger - ggml_tensor * deepstack_norm_w = nullptr; - ggml_tensor * deepstack_norm_b = nullptr; - ggml_tensor * deepstack_fc1_w = nullptr; - ggml_tensor * deepstack_fc1_b = nullptr; - ggml_tensor * deepstack_fc2_w = nullptr; - ggml_tensor * deepstack_fc2_b = nullptr; - - bool has_deepstack() const { - return deepstack_fc1_w != nullptr; - } - - // sam rel_pos - ggml_tensor * rel_pos_w = nullptr; - ggml_tensor * rel_pos_h = nullptr; -}; - -struct clip_model { - clip_modality modality = CLIP_MODALITY_VISION; - projector_type proj_type = PROJECTOR_TYPE_MLP; - clip_hparams hparams; - - // embeddings - ggml_tensor * class_embedding = nullptr; - ggml_tensor * patch_embeddings_0 = 
nullptr; - ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) - ggml_tensor * patch_bias = nullptr; - ggml_tensor * position_embeddings = nullptr; - - ggml_tensor * pre_ln_w = nullptr; - ggml_tensor * pre_ln_b = nullptr; - - std::vector layers; - - int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer - - ggml_tensor * post_ln_w; - ggml_tensor * post_ln_b; - - ggml_tensor * fc_w; - ggml_tensor * fc_b; - ggml_tensor * mm_fc_w; - ggml_tensor * mm_fc_b; - - // LLaVA projection - ggml_tensor * mm_input_norm_w = nullptr; - ggml_tensor * mm_input_norm_b = nullptr; - ggml_tensor * mm_0_w = nullptr; - ggml_tensor * mm_0_b = nullptr; - ggml_tensor * mm_2_w = nullptr; - ggml_tensor * mm_2_b = nullptr; - - ggml_tensor * image_newline = nullptr; - ggml_tensor * view_seperator = nullptr; - - // Yi type models with mlp+normalization projection - ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 - ggml_tensor * mm_1_b = nullptr; - ggml_tensor * mm_3_w = nullptr; - ggml_tensor * mm_3_b = nullptr; - ggml_tensor * mm_4_w = nullptr; - ggml_tensor * mm_4_b = nullptr; - - // GLMV-Edge projection - ggml_tensor * mm_model_adapter_conv_w = nullptr; - ggml_tensor * mm_model_adapter_conv_b = nullptr; - - // MobileVLM projection - ggml_tensor * mm_model_mlp_1_w = nullptr; - ggml_tensor * mm_model_mlp_1_b = nullptr; - ggml_tensor * mm_model_mlp_3_w = nullptr; - ggml_tensor * mm_model_mlp_3_b = nullptr; - ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; - ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; - ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; - ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; - ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; - ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; - ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; - ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; - ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; - ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; - ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; - ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; - - // MobileVLM_V2 projection - ggml_tensor * mm_model_mlp_0_w = nullptr; - ggml_tensor * mm_model_mlp_0_b = nullptr; - ggml_tensor * mm_model_mlp_2_w = nullptr; - ggml_tensor * mm_model_mlp_2_b = nullptr; - ggml_tensor * mm_model_peg_0_w = nullptr; - ggml_tensor * mm_model_peg_0_b = nullptr; - - // MINICPMV projection - ggml_tensor * mm_model_pos_embed_k = nullptr; - ggml_tensor * mm_model_query = nullptr; - ggml_tensor * mm_model_proj = nullptr; - ggml_tensor * mm_model_kv_proj = nullptr; - ggml_tensor * mm_model_attn_q_w = nullptr; - ggml_tensor * mm_model_attn_q_b = nullptr; - ggml_tensor * mm_model_attn_k_w = nullptr; - ggml_tensor * mm_model_attn_k_b = nullptr; - ggml_tensor * mm_model_attn_v_w = nullptr; - ggml_tensor * mm_model_attn_v_b = nullptr; - ggml_tensor * mm_model_attn_o_w = nullptr; - ggml_tensor * mm_model_attn_o_b = nullptr; - ggml_tensor * mm_model_ln_q_w = nullptr; - 
ggml_tensor * mm_model_ln_q_b = nullptr; - ggml_tensor * mm_model_ln_kv_w = nullptr; - ggml_tensor * mm_model_ln_kv_b = nullptr; - ggml_tensor * mm_model_ln_post_w = nullptr; - ggml_tensor * mm_model_ln_post_b = nullptr; - - // gemma3 - ggml_tensor * mm_input_proj_w = nullptr; - ggml_tensor * mm_soft_emb_norm_w = nullptr; - - // pixtral - ggml_tensor * token_embd_img_break = nullptr; - ggml_tensor * mm_patch_merger_w = nullptr; - - // ultravox / whisper encoder - ggml_tensor * conv1d_1_w = nullptr; - ggml_tensor * conv1d_1_b = nullptr; - ggml_tensor * conv1d_2_w = nullptr; - ggml_tensor * conv1d_2_b = nullptr; - ggml_tensor * mm_norm_pre_w = nullptr; - ggml_tensor * mm_norm_mid_w = nullptr; - - // cogvlm - ggml_tensor * mm_post_fc_norm_w = nullptr; - ggml_tensor * mm_post_fc_norm_b = nullptr; - ggml_tensor * mm_h_to_4h_w = nullptr; - ggml_tensor * mm_gate_w = nullptr; - ggml_tensor * mm_4h_to_h_w = nullptr; - ggml_tensor * mm_boi = nullptr; - ggml_tensor * mm_eoi = nullptr; - - // deepseek ocr sam - ggml_tensor * patch_embed_proj_w = nullptr; - ggml_tensor * patch_embed_proj_b = nullptr; - ggml_tensor * pos_embed = nullptr; - - bool audio_has_avgpool() const { - return proj_type == PROJECTOR_TYPE_QWEN2A - || proj_type == PROJECTOR_TYPE_VOXTRAL; - } - - bool audio_has_stack_frames() const { - return proj_type == PROJECTOR_TYPE_ULTRAVOX - || proj_type == PROJECTOR_TYPE_VOXTRAL; - } - ggml_tensor * neck_0_w; - ggml_tensor * neck_1_w; - ggml_tensor * neck_1_b; - ggml_tensor * neck_2_w; - ggml_tensor * neck_3_w; - ggml_tensor * neck_3_b; - ggml_tensor * net_2; - ggml_tensor * net_3; - - int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder - - std::vector sam_layers; - -}; - struct clip_ctx { clip_model model; @@ -539,2431 +220,618 @@ struct clip_ctx { } }; -struct clip_graph { - clip_ctx * ctx; - const clip_model & model; - const clip_hparams & hparams; +// +// clip_graph +// - // we only support single image per batch - const clip_image_f32 & img; +clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + model(ctx->model), + hparams(model.hparams), + proj_type(ctx->proj_type()), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + n_mmproj_embd(clip_n_mmproj_embd(ctx)), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)), + flash_attn_type(ctx->flash_attn_type), + debug_graph(ctx->debug_graph), + debug_print_tensors(ctx->debug_print_tensors) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); +} - const int patch_size; - const int n_patches_x; - const int n_patches_y; - const int n_patches; - const int n_embd; - const int n_head; - const int d_head; - const int n_layer; - const float eps; - const float kq_scale; - - ggml_context_ptr ctx0_ptr; - ggml_context * ctx0; - ggml_cgraph * gf; - - clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : - ctx(ctx), - model(ctx->model), - hparams(model.hparams), - img(img), - patch_size(hparams.patch_size), - n_patches_x(img.nx / patch_size), - n_patches_y(img.ny / patch_size), - n_patches(n_patches_x * n_patches_y), - n_embd(hparams.n_embd), - n_head(hparams.n_head), - 
d_head(n_embd / n_head), - n_layer(hparams.n_layer), - eps(hparams.eps), - kq_scale(1.0f / sqrtf((float)d_head)) { - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - ctx0_ptr.reset(ggml_init(params)); - ctx0 = ctx0_ptr.get(); - gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); - } - - ggml_cgraph * build_siglip() { - ggml_tensor * inp = build_inp(); - - ggml_tensor * learned_pos_embd = model.position_embeddings; - if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { - learned_pos_embd = resize_position_embeddings(); - } - - ggml_tensor * cur = build_vit( - inp, n_patches, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - nullptr); - - if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) { - const int batch_size = 1; - GGML_ASSERT(n_patches_x == n_patches_y); - const int patches_per_image = n_patches_x; - const int kernel_size = hparams.n_merge; - - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); - - // doing a pool2d to reduce the number of output tokens - cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - // apply norm before projection - cur = ggml_rms_norm(ctx0, cur, eps); - cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); - - // apply projection - cur = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), - cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { - // pixel_shuffle - // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - cur = ggml_mul_mat(ctx0, model.fc_w, cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { - // pixel unshuffle block - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - - // projection - cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm - cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); - cur = ggml_add(ctx0, cur, model.mm_input_norm_b); - - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) { - cur = build_ffn(cur, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - hparams.ffn_op, - -1); - - } else { - GGML_ABORT("SigLIP: Unsupported projector type"); - } - - // build the graph +void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { + if (debug_graph) { + ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); + std::string cur_name = il >= 0 ? 
std::string(name) + "_" + std::to_string(il) : name; + ggml_set_name(cur, cur_name.c_str()); + ggml_set_output(cur); ggml_build_forward_expand(gf, cur); + debug_print_tensors.push_back(cur); + } +} - return gf; +// siglip2 naflex +ggml_tensor * clip_graph::resize_position_embeddings() { + ggml_tensor * pos_embd = model.position_embeddings; + const int height = img.ny / patch_size; + const int width = img.nx / patch_size; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; + const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); + + GGML_ASSERT(pos_embd); + + if (height == n_per_side && width == n_per_side) { + return pos_embd; } - ggml_cgraph * build_deepseek_ocr() { - //patch embedding - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * sam_out = build_sam(inp_raw); - ggml_tensor * clip_out = build_dsocr_clip(sam_out); - - int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; - - sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); - sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); - clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); - - ggml_tensor * cur; - cur = ggml_concat(ctx0, clip_out, sam_out, 0); - cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); - cur = ggml_cont(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.fc_w, cur); - cur = ggml_add(ctx0, cur, model.fc_b); - - const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); - const auto w = h; - const auto n_dim = cur->ne[0]; - - ggml_tensor * imgnl; - ggml_tensor * vs; - - imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); - vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) - cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); - cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); - cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) - - cb(cur, "dsocr_output", -1); - - ggml_build_forward_expand(gf, cur); - return gf; - } - - ggml_cgraph * build_pixtral() { - const int n_merge = hparams.n_merge; - - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); - }; - - ggml_tensor * inp = build_inp(); - ggml_tensor * cur = build_vit( - inp, n_patches, - NORM_TYPE_RMS, - hparams.ffn_op, - nullptr, // no learned pos embd - add_pos); - - // mistral small 3.1 patch merger - // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 - if (model.mm_patch_merger_w) { - GGML_ASSERT(hparams.n_merge > 0); - - cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); - - // reshape image tokens to 2D grid - cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] - cur = ggml_cont(ctx0, cur); - - // torch.nn.functional.unfold is just an im2col under the hood - // we just need a dummy kernel to make it work - ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); - cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); - - // 
project to n_embd - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); - } - - // LlavaMultiModalProjector (always using GELU activation) - { - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - } - - // arrangement of the [IMG_BREAK] token - if (model.token_embd_img_break) { - // not efficient, but works - // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] - // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension - // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] - - const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; - const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; - const int p_total = p_x * p_y; - const int n_embd_text = cur->ne[0]; - const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row - - ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); - ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); - tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor - tok = ggml_add(ctx0, tok, model.token_embd_img_break); - tmp = ggml_concat(ctx0, tmp, tok, 1); - cur = ggml_view_2d(ctx0, tmp, - n_embd_text, n_tokens_output, - ggml_row_size(tmp->type, n_embd_text), 0); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // Qwen2VL and Qwen2.5VL use M-RoPE - ggml_cgraph * build_qwen2vl() { - GGML_ASSERT(model.patch_bias == nullptr); - GGML_ASSERT(model.class_embedding == nullptr); - - const int batch_size = 1; - const bool use_window_attn = hparams.n_wa_pattern > 0; - const int n_wa_pattern = hparams.n_wa_pattern; - const int n_pos = n_patches; - const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position - - norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL - ? 
NORM_TYPE_RMS // qwen 2.5 vl - : NORM_TYPE_NORMAL; // qwen 2 vl - - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); - - // second conv dimension - { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_cont_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); - inp = ggml_cont_3d( - ctx0, inp, - n_embd, n_patches_x * n_patches_y, batch_size); - } - - ggml_tensor * inpL = inp; - ggml_tensor * window_mask = nullptr; - ggml_tensor * window_idx = nullptr; - ggml_tensor * inv_window_idx = nullptr; - - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - } - - if (use_window_attn) { - // handle window attention inputs - inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); - ggml_set_name(inv_window_idx, "inv_window_idx"); - ggml_set_input(inv_window_idx); - // mask for window attention - window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); - ggml_set_name(window_mask, "window_mask"); - ggml_set_input(window_mask); - - // if flash attn is used, we need to pad the mask and cast to f16 - if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { - window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); - } - - // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] - GGML_ASSERT(batch_size == 1); - inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); - inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); - inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - const auto & layer = model.layers[il]; - const bool full_attn = use_window_attn ? 
(il + 1) % n_wa_pattern == 0 : true; - - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "ln1", il); - - // self-attention - { - ggml_tensor * Qcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); - ggml_tensor * Kcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); - ggml_tensor * Vcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); - - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // apply M-RoPE - Qcur = ggml_rope_multi( - ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - Kcur = ggml_rope_multi( - ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - - cb(Qcur, "Qcur_rope", il); - cb(Kcur, "Kcur_rope", il); - - ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, attn_mask, kq_scale, il); - cb(cur, "attn_out", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); - } - - // multimodal projection - ggml_tensor * embeddings = inpL; - embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); - embeddings = build_ffn(embeddings, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - FFN_GELU, - -1); - - if (use_window_attn) { - window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); - ggml_set_name(window_idx, "window_idx"); - ggml_set_input(window_idx); - - // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] - GGML_ASSERT(batch_size == 1); - embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); - embeddings = ggml_get_rows(ctx0, embeddings, window_idx); - embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - // Qwen3VL - ggml_cgraph * build_qwen3vl() { - GGML_ASSERT(model.patch_bias != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - GGML_ASSERT(model.class_embedding == nullptr); - - const int batch_size = 1; - const int n_pos = n_patches; - const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position - - norm_type norm_t = NORM_TYPE_NORMAL; - - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, 
patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); - - // second conv dimension - { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_cont_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); - inp = ggml_cont_3d( - ctx0, inp, - n_embd, n_patches_x * n_patches_y, batch_size); - } - - // add patch bias - if (model.patch_bias != nullptr) { - inp = ggml_add(ctx0, inp, model.patch_bias); - cb(inp, "patch_bias", -1); - } - - // calculate absolute position embedding and apply - ggml_tensor * learned_pos_embd = resize_position_embeddings(); - learned_pos_embd = ggml_cont_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - learned_pos_embd = ggml_reshape_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); - learned_pos_embd = ggml_cont_3d( - ctx0, learned_pos_embd, - n_embd, n_patches_x * n_patches_y, batch_size); + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) + pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) + pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) + + return pos_embd; +} + +// build vision transformer (ViT) cgraph +// this function should cover most of the models +// if your model has specific features, you should probably duplicate this function +ggml_tensor * clip_graph::build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos + ) { + if (learned_pos_embd) { inp = ggml_add(ctx0, inp, learned_pos_embd); - cb(inp, "inp_pos_emb", -1); + cb(inp, "pos_embed", -1); + } - ggml_tensor * inpL = inp; + ggml_tensor * inpL = inp; - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + cb(inpL, "pre_ln", -1); + } - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - } + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] - ggml_tensor * deepstack_features = nullptr; - const int merge_factor = hparams.n_merge > 0 ? 
hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "ln1", il); - - // self-attention - { + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + if (layer.qkv_w != nullptr) { + // fused qkv cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); - - ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); - - ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // apply M-RoPE - Qcur = ggml_rope_multi( - ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - Kcur = ggml_rope_multi( - ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - - cb(Qcur, "Qcur_rope", il); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - if (layer.has_deepstack()) { - ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); - feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); - feat = build_ffn(feat, - layer.deepstack_fc1_w, layer.deepstack_fc1_b, - nullptr, nullptr, - layer.deepstack_fc2_w, layer.deepstack_fc2_b, - ffn_op_type::FFN_GELU, il); - - if(!deepstack_features) { - deepstack_features = feat; - } else { - // concat along the feature dimension - deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + if (layer.qkv_b != nullptr) { + cur = ggml_add(ctx0, cur, layer.qkv_b); } - } - inpL = cur; - } + Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); - } + Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); - // 
multimodal projection - ggml_tensor * embeddings = inpL; - embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - embeddings = build_ffn(embeddings, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - ffn_op_type::FFN_GELU, -1); + // TODO: q/k norm requires row size == n_embd, while here it's d_head + // we can add support in the future if needed + GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); - embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - ggml_cgraph * build_minicpmv() { - GGML_ASSERT(model.class_embedding == nullptr); - const int n_pos = n_patches; - const int n_embd_proj = clip_n_mmproj_embd(ctx); - - // position embeddings for the projector (not for ViT) - // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 - // base frequency omega - ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); - ggml_set_name(omega, "omega"); - ggml_set_input(omega); - - // 2D input positions (using float for sinusoidal embeddings) - ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - // for selecting learned pos embd, used by ViT - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - - ggml_tensor * inp = build_inp(); - ggml_tensor * embeddings = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - nullptr); - - // resampler projector (it is just another transformer) - - ggml_tensor * q = model.mm_model_query; - ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - - // norm - q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); - v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); - - // calculate sinusoidal pos embd - ggml_tensor * pos_embed = nullptr; - { - // outer product - ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows - ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); - ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); - // sin and cos - ggml_tensor * pos_embd_x = ggml_concat( - ctx0, - ggml_sin(ctx0, theta_x), - ggml_cos(ctx0, theta_x), - 0 // concat on first dim - ); - ggml_tensor * pos_embd_y = ggml_concat( - ctx0, - ggml_sin(ctx0, theta_y), - ggml_cos(ctx0, theta_y), - 0 // concat on first dim - ); - pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); - } - - // k = v + pos_embed - ggml_tensor * k = ggml_add(ctx0, v, pos_embed); - - // attention - { - const int d_head = 128; - int n_head = n_embd_proj/d_head; - // Use actual config value if available, otherwise fall back to hardcoded values - int num_query = ctx->model.hparams.minicpmv_query_num; - ggml_tensor * Q = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), - 
model.mm_model_attn_q_b); - ggml_tensor * K = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), - model.mm_model_attn_k_b); - ggml_tensor * V = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), - model.mm_model_attn_v_b); - - Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); - K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); - V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); - - cb(Q, "resampler_Q", -1); - cb(K, "resampler_K", -1); - cb(V, "resampler_V", -1); - - float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); - embeddings = build_attn( - model.mm_model_attn_o_w, - model.mm_model_attn_o_b, - Q, K, V, nullptr, resampler_kq_scale, -1); - cb(embeddings, "resampler_attn_out", -1); - } - // layernorm - embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); - - // projection - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - ggml_cgraph * build_internvl() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; - ggml_tensor * inp = build_inp(); - - // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - // The larger models use a different ViT, which uses RMS norm instead of layer norm - // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 - norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) - ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) - : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) - - ggml_tensor * cur = build_vit( - inp, n_pos, - norm_t, - hparams.ffn_op, - model.position_embeddings, - nullptr); - - // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); - - // pixel shuffle - { - const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - const int height = n_patches_y; - const int width = n_patches_x; - GGML_ASSERT(scale_factor > 0); - cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, - n_embd * scale_factor * scale_factor, - height / scale_factor, - width / scale_factor, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, - n_embd * scale_factor * scale_factor, - cur->ne[1] * cur->ne[2]); - } - - // projector (always using GELU activation) - { - // projector LayerNorm uses pytorch's default eps = 1e-5 - // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 - cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_3_w, model.mm_3_b, - FFN_GELU, - -1); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_llama4() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; // +1 for [CLS] - - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * 
pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - ggml_tensor * inp = build_inp_raw(); - - // Llama4UnfoldConvolution - { - ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, - patch_size, patch_size, 3, n_embd); - inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); - inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); - inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); - cb(inp, "patch_conv", -1); - } - - // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - // build ViT with 2D position embeddings - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - // first half is X axis and second half is Y axis - // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 - // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 - return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); - }; - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - model.position_embeddings, - add_pos); - - // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); - - // pixel shuffle - // based on Llama4VisionPixelShuffleMLP - // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 - { - const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - GGML_ASSERT(scale_factor > 0); - GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images - cur = ggml_reshape_4d(ctx0, cur, - n_embd * scale_factor, - n_patches_x / scale_factor, - n_patches_y, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, - n_embd * scale_factor * scale_factor, - n_patches_x / scale_factor, - n_patches_y / scale_factor, - bsz); - //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, - n_embd * scale_factor * scale_factor, - n_patches / scale_factor / scale_factor); - cb(cur, "pixel_shuffle", -1); - } - - // based on Llama4VisionMLP2 (always uses GELU activation, no bias) - { - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); - cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); - cur = ggml_gelu(ctx0, cur); - cb(cur, "adapter_mlp", -1); - } - - // Llama4MultiModalProjector - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); - cb(cur, "projected", -1); - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_kimivl() { - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - ggml_tensor * learned_pos_embd = resize_position_embeddings(); - - // build ViT with 2D position embeddings - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - // first half is X axis and second half is Y axis - return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); - }; - - ggml_tensor * inp = build_inp(); - ggml_tensor * cur = build_vit( 
- inp, n_patches, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - add_pos); - - cb(cur, "vit_out", -1); - - { - // patch_merger - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - - // projection norm - int proj_inp_dim = cur->ne[0]; - cur = ggml_view_2d(ctx0, cur, - n_embd, cur->ne[1] * scale_factor * scale_factor, - ggml_row_size(cur->type, n_embd), 0); - cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm - cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); - cur = ggml_add(ctx0, cur, model.mm_input_norm_b); - cur = ggml_view_2d(ctx0, cur, - proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, - ggml_row_size(cur->type, proj_inp_dim), 0); - cb(cur, "proj_inp_normed", -1); - - // projection mlp - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - cb(cur, "proj_out", -1); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // this graph is used by llava, granite and glm - // due to having embedding_stack (used by granite), we cannot reuse build_vit - ggml_cgraph * build_llava() { - const int batch_size = 1; - const int n_pos = n_patches + (model.class_embedding ? 1 : 0); - - GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); - - // Calculate the deepest feature layer based on hparams and projector type - int max_feature_layer = n_layer; - { - // Get the index of the second to last layer; this is the default for models that have a llava projector - int il_last = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { - il_last += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - // NOTE: only used by granite-vision models for now - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; - } - - ggml_tensor * inp = build_inp(); - - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - } - - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); - - ggml_tensor * inpL = inp; - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); - cb(inpL, "pre_ln", -1); - } - - std::vector embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; - - // loop over layers - for (int il = 0; il < max_feature_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // If this is an embedding feature layer, save the output. - // NOTE: 0 index here refers to the input to the encoder. 
- if (vision_feature_layer.find(il) != vision_feature_layer.end()) { - embedding_stack.push_back(cur); - } - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "layer_inp_normed", il); - - // self-attention - { - ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + } else { + // separate q, k, v + Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); if (layer.q_b) { Qcur = ggml_add(ctx0, Qcur, layer.q_b); } - ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); if (layer.k_b) { Kcur = ggml_add(ctx0, Kcur, layer.k_b); } - ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); if (layer.v_b) { Vcur = ggml_add(ctx0, Vcur, layer.v_b); } + if (layer.q_norm) { + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); + cb(Kcur, "Kcur_norm", il); + } + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); } - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); - } - - ggml_tensor * embeddings = inpL; - - // process vision feature layers (used by granite) - { - // final layer is a vision feature layer - if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { - embedding_stack.push_back(inpL); - } - - // If feature layers are explicitly set, stack them (if we have multiple) - if (!embedding_stack.empty()) { - embeddings = embedding_stack[0]; - for (size_t i = 1; i < embedding_stack.size(); i++) { - embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); - } - } - } - - // llava projector (also used by granite) - if (ctx->model.hparams.has_llava_projector) { - embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); - - // print_tensor_info(embeddings, "embeddings"); - - // llava projector - if (ctx->proj_type() == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - embeddings = ggml_gelu(ctx0, embeddings); - if (model.mm_2_w) { - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } - } - 
else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); - // First LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), - model.mm_1_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); - - // Second LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), - model.mm_4_b); - } - else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) { - // MobileVLM projector - int n_patch = 24; - ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); - mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); - mlp_1 = ggml_gelu(ctx0, mlp_1); - ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); - mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] - - // block 1 - ggml_tensor * block_1 = nullptr; - { - // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); - mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); - // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); - - // layer norm - // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // hardswish - ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); - block_1 = 
ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // residual - block_1 = ggml_add(ctx0, mlp_3, block_1); - } - - // block_2 - { - // stride = 2 - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); - - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // layer norm - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // hardswish - ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - // not sure the parameters is right for globalAvgPooling - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - - // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); - block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); - // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } - embeddings = block_1; - } - else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2) - { - int n_patch = 24; - ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); - mlp_0 = ggml_gelu(ctx0, mlp_0); - ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); - mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); - // mlp_2 ne = [2048, 576, 1, 1] - // // AVG Pool Layer 2*2, 
strides = 2 - mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); - // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); - // mlp_2 ne [24, 24, 2048, 1] - mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); - // weight ne = [3, 3, 2048, 1] - ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); - peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); - embeddings = peg_0; - } - else { - GGML_ABORT("fatal error"); - } - } - - // glm projector - else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); - embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - // GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_swiglu_split(ctx0, embeddings, x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - // arrangement of BOI/EOI token embeddings - // note: these embeddings are not present in text model, hence we cannot process them as text tokens - // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 - { - embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI - embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI - } - } - - else { - GGML_ABORT("llava: unknown projector type"); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - // whisper encoder with custom projector - ggml_cgraph * build_whisper_enc() { - const int n_frames = img.nx; - const int n_pos = n_frames / 2; - GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); - - ggml_tensor * inp = build_inp_raw(1); - - // conv1d block - { - // convolution + gelu - ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); - cur = ggml_add(ctx0, cur, model.conv1d_1_b); - - cur = ggml_gelu_erf(ctx0, cur); - - cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); - cur = ggml_add(ctx0, cur, model.conv1d_2_b); - - cur = ggml_gelu_erf(ctx0, cur); - // transpose - inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cb(inp, "after_conv1d", -1); - } - - // sanity check (only check one layer, but it should be the same for all) - GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); - GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); - GGML_ASSERT(model.layers[0].q_b); - GGML_ASSERT(model.layers[0].v_b); - GGML_ASSERT(!model.layers[0].k_b); 
// no bias for k - GGML_ASSERT(model.post_ln_w && model.post_ln_b); - - ggml_tensor * pos_embd_selected = ggml_view_2d( - ctx0, model.position_embeddings, - model.position_embeddings->ne[0], n_pos, - model.position_embeddings->nb[1], 0 - ); - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - pos_embd_selected, - nullptr); - - cb(cur, "after_transformer", -1); - - if (model.audio_has_stack_frames()) { - // StackAudioFrames - // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py - int64_t stride = n_embd * hparams.proj_stack_factor; - int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); - int64_t pad = padded_len - ggml_nelements(cur); - if (pad > 0) { - cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); - cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); - } - cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, - ggml_row_size(cur->type, stride), 0); - cb(cur, "after_stacked", -1); - } - - if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) { - // UltravoxProjector - // pre-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); - - // ffn in - cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); - - // swiglu - // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half - cur = ggml_swiglu_swapped(ctx0, cur); - - // mid-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); - - // ffn out - cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) { - // projector - cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); - cur = ggml_add(ctx0, cur, model.mm_fc_b); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) { - // projector - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU_ERF, - -1); - - } else { - GGML_ABORT("%s: unknown projector type", __func__); - } - - cb(cur, "projected", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // cogvlm vision encoder - ggml_cgraph * build_cogvlm() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; // +1 for [CLS] - - // build input and concatenate class embedding - ggml_tensor * inp = build_inp(); - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - inp = ggml_add(ctx0, inp, model.position_embeddings); - cb(inp, "inp_pos", -1); - - ggml_tensor * inpL = inp; - - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; - - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - - cur = ggml_add(ctx0, cur, layer.qkv_b); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], 0); - ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], n_embd * sizeof(float)); - ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], 2 * n_embd * sizeof(float)); - cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); + if (add_pos) { + Qcur = add_pos(Qcur, layer); + Kcur = add_pos(Kcur, layer); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + } + cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il); cb(cur, "attn_out", il); - - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "attn_post_norm", il); - - 
cur = ggml_add(ctx0, cur, inpL); - inpL = cur; - - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "ffn_post_norm", il); - - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "layer_out", il); - inpL = cur; - } - // remove CLS token (like build_llama4 does) - ggml_tensor * cur = ggml_view_2d(ctx0, inpL, - n_embd, n_patches, - ggml_row_size(inpL->type, n_embd), 0); + if (layer.ls_1_w) { + cur = ggml_mul(ctx0, cur, layer.ls_1_w); + cb(cur, "attn_out_scaled", il); + } - // Multiply with mm_model_proj - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); - // Apply layernorm, weight, bias - cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + inpL = cur; // inpL = residual, cur = hidden_states - // Apply GELU - cur = ggml_gelu_inplace(ctx0, cur); + cb(cur, "ffn_inp", il); - // Branch 1: multiply with mm_h_to_4h_w - ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); - // Branch 2: multiply with mm_gate_w - ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + ffn_t, il); - // Apply silu - gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + cb(cur, "ffn_out", il); - // Apply mm_4h_to_h_w - cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + if (layer.ls_2_w) { + cur = ggml_mul(ctx0, cur, layer.ls_2_w); + cb(cur, "ffn_out_scaled", il); + } - // Concatenate with boi and eoi - cur = ggml_concat(ctx0, model.mm_boi, cur, 1); - cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; + inpL = cur; } -private: - // - // utility functions - // - - void cb(ggml_tensor * cur0, const char * name, int il) const { - if (ctx->debug_graph) { - ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); - std::string cur_name = il >= 0 ? 
std::string(name) + "_" + std::to_string(il) : name; - ggml_set_name(cur, cur_name.c_str()); - ggml_set_output(cur); - ggml_build_forward_expand(gf, cur); - ctx->debug_print_tensors.push_back(cur); - } + if (model.audio_has_avgpool()) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; } - // siglip2 naflex - ggml_tensor * resize_position_embeddings() { - ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; - const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); - - GGML_ASSERT(pos_embd); - - if (height == n_per_side && width == n_per_side) { - return pos_embd; - } - - pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) - pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) - pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) - pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) - pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) - - return pos_embd; + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); } + return inpL; +} - // build vision transformer (ViT) cgraph - // this function should cover most of the models - // if your model has specific features, you should probably duplicate this function - ggml_tensor * build_vit( - ggml_tensor * inp, - int64_t n_pos, - norm_type norm_t, - ffn_op_type ffn_t, - ggml_tensor * learned_pos_embd, - std::function add_pos - ) { - if (learned_pos_embd) { - inp = ggml_add(ctx0, inp, learned_pos_embd); - cb(inp, "pos_embed", -1); - } - ggml_tensor * inpL = inp; - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - cb(inpL, "pre_ln", -1); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "layer_inp_normed", il); - - // self-attention - { - ggml_tensor * Qcur = nullptr; - ggml_tensor * Kcur = nullptr; - ggml_tensor * Vcur = nullptr; - if (layer.qkv_w != nullptr) { - // fused qkv - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - if (layer.qkv_b != nullptr) { - cur = ggml_add(ctx0, cur, layer.qkv_b); - } - - Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); - - Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); - - Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - - // TODO: q/k norm requires row size == n_embd, while here it's d_head - // we can add support in the future if needed - GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); - - } else { - // separate q, k, v - Qcur = 
ggml_mul_mat(ctx0, layer.q_w, cur); - if (layer.q_b) { - Qcur = ggml_add(ctx0, Qcur, layer.q_b); - } - - Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); - if (layer.k_b) { - Kcur = ggml_add(ctx0, Kcur, layer.k_b); - } - - Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); - if (layer.v_b) { - Vcur = ggml_add(ctx0, Vcur, layer.v_b); - } - - if (layer.q_norm) { - Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); - cb(Qcur, "Qcur_norm", il); - } - - if (layer.k_norm) { - Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); - cb(Kcur, "Kcur_norm", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - if (add_pos) { - Qcur = add_pos(Qcur, layer); - Kcur = add_pos(Kcur, layer); - cb(Qcur, "Qcur_pos", il); - cb(Kcur, "Kcur_pos", il); - } - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); - } - - if (layer.ls_1_w) { - cur = ggml_mul(ctx0, cur, layer.ls_1_w); - cb(cur, "attn_out_scaled", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - ffn_t, il); - - cb(cur, "ffn_out", il); - - if (layer.ls_2_w) { - cur = ggml_mul(ctx0, cur, layer.ls_2_w); - cb(cur, "ffn_out_scaled", il); - } - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - if (ctx->model.audio_has_avgpool()) { - ggml_tensor * cur = inpL; - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont(ctx0, cur); - cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont(ctx0, cur); - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); - } - return inpL; - } - - static ggml_tensor * get_rel_pos( - ggml_context * ctx, - ggml_tensor * rel_pos, // [L, C] - ggml_tensor * indices, // [q_size, k_size] - int q_size, - int k_size - ) { - const int64_t C = rel_pos->ne[0]; // channels - const int64_t L = rel_pos->ne[1]; // length - - GGML_ASSERT(indices != nullptr); - GGML_ASSERT(indices->type == GGML_TYPE_I32); - GGML_ASSERT(indices->ne[0] == k_size); - GGML_ASSERT(indices->ne[1] == q_size); - - const auto max_rel_dist = 2*std::max(q_size, k_size) - 1; - ggml_tensor * cur = rel_pos; - - if (max_rel_dist != L) { - // Linear interpolation - int64_t ne0 = cur->ne[0]; - int64_t ne1 = cur->ne[1]; - int64_t ne2 = cur->ne[2]; - int64_t ne3 = cur->ne[3]; - - cur = ggml_reshape_3d( - ctx, - ggml_cont(ctx, ggml_permute(ctx, cur, 1, 0, 2, 3)), - ne1, 1, ne0*ne2*ne3 - ); - cur = ggml_reshape_4d( - ctx, - ggml_interpolate( - ctx, - cur, - max_rel_dist, 1, ne0*ne2*ne3, 1, - ggml_scale_mode::GGML_SCALE_MODE_BILINEAR - ), - max_rel_dist, ne0, ne2, ne3 - ); - cur = ggml_cont(ctx, ggml_permute(ctx, cur, 1, 0, 2, 3)); - } - - // Flatten indices to 1D for ggml_get_rows - int qk = q_size * k_size; - - cur = ggml_reshape_3d( - ctx, - ggml_get_rows(ctx, cur, ggml_reshape_1d(ctx, indices, qk)), - C, k_size, q_size - ); - - return 
cur; // [C, k_size, q_size] - } - - // Implementation based on approach suggested by Acly - // See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 - static ggml_tensor* window_partition(ggml_context* ctx, ggml_tensor* x, int window) { - auto [c, w, h, b] = x->ne; - // same as - // x = ggml_win_part(m, x, window); - // x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); - - int64_t px = (window - w % window) % window; - int64_t py = (window - h % window) % window; - int64_t npw = (w + px) / window; - int64_t nph = (h + py) / window; - - if (px > 0 || py > 0) { - x = ggml_pad(ctx, x, 0, int(px), int(py), 0); - } - x = ggml_reshape_4d(ctx, x, c * window, npw, window, nph * b); - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); - x = ggml_reshape_4d(ctx, x, c, window, window, npw * nph * b); - return x; - } - - // Implementation based on approach suggested by Acly - // See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 - static ggml_tensor* window_unpartition(ggml_context* m, ggml_tensor* x, int w, int h, int window) { - int64_t c = x->ne[0]; - // same as - // x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]); - // x = ggml_win_unpart(m, x, w, h, window); - - int64_t px = (window - w % window) % window; - int64_t py = (window - h % window) % window; - int64_t npw = (w + px) / window; - int64_t nph = (h + py) / window; - - int64_t b = x->ne[3] / (npw * nph); - x = ggml_reshape_4d(m, x, c * window, window, npw, nph * b); - x = ggml_cont(m, ggml_permute(m, x, 0, 2, 1, 3)); - x = ggml_reshape_4d(m, x, c, w + px, h + py, b); - x = ggml_view_4d(m, x, x->ne[0], w, h, x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0); - x = ggml_cont(m, x); - return x; - } // build the input after conv2d (inp_raw --> patches) // returns tensor with shape [n_embd, n_patches] - ggml_tensor * build_inp() { - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - if (model.patch_bias) { - inp = ggml_add(ctx0, inp, model.patch_bias); - cb(inp, "patch_bias", -1); - } - return inp; + ggml_tensor * clip_graph::build_inp() { + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + return inp; +} + +ggml_tensor * clip_graph::build_inp_raw(int channels) { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + return inp_raw; +} + +ggml_tensor * clip_graph::build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const { + + cur = type == NORM_TYPE_RMS + ? 
ggml_rms_norm(ctx0, cur, norm_eps) + : ggml_norm(ctx0, cur, norm_eps); + + if (mw || mb) { + cb(cur, "norm", il); } - ggml_tensor * build_inp_raw(int channels = 3) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - return inp_raw; - } - - ggml_tensor * build_norm( - ggml_tensor * cur, - ggml_tensor * mw, - ggml_tensor * mb, - norm_type type, - float norm_eps, - int il) const { - - cur = type == NORM_TYPE_RMS - ? ggml_rms_norm(ctx0, cur, norm_eps) - : ggml_norm(ctx0, cur, norm_eps); - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx0, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - + if (mw) { + cur = ggml_mul(ctx0, cur, mw); if (mb) { - cur = ggml_add(ctx0, cur, mb); + cb(cur, "norm_w", il); } - - return cur; } - ggml_tensor * build_ffn( - ggml_tensor * cur, - ggml_tensor * up, - ggml_tensor * up_b, - ggml_tensor * gate, - ggml_tensor * gate_b, - ggml_tensor * down, - ggml_tensor * down_b, - ffn_op_type type_op, - int il) const { - - ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx0, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (gate) { - cur = ggml_mul_mat(ctx0, gate, cur); - cb(cur, "ffn_gate", il); - - if (gate_b) { - cur = ggml_add(ctx0, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - } else { - cur = tmp; - } - - // we only support parallel ffn for now - switch (type_op) { - case FFN_SILU: - if (gate) { - cur = ggml_swiglu_split(ctx0, cur, tmp); - cb(cur, "ffn_swiglu", il); - } else { - cur = ggml_silu(ctx0, cur); - cb(cur, "ffn_silu", il); - } break; - case FFN_GELU: - if (gate) { - cur = ggml_geglu_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu", il); - } else { - cur = ggml_gelu(ctx0, cur); - cb(cur, "ffn_gelu", il); - } break; - case FFN_GELU_ERF: - if (gate) { - cur = ggml_geglu_erf_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu_erf", il); - } else { - cur = ggml_gelu_erf(ctx0, cur); - cb(cur, "ffn_gelu_erf", il); - } break; - case FFN_GELU_QUICK: - if (gate) { - cur = ggml_geglu_quick_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu_quick", il); - } else { - cur = ggml_gelu_quick(ctx0, cur); - cb(cur, "ffn_gelu_quick", il); - } break; - } - - if (down) { - cur = ggml_mul_mat(ctx0, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx0, cur, down_b); - } - - return cur; + if (mb) { + cur = ggml_add(ctx0, cur, mb); } - ggml_tensor * build_attn( - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_mask, - float kq_scale, - int il) const { - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(gf, q_cur); - ggml_build_forward_expand(gf, k_cur); - ggml_build_forward_expand(gf, v_cur); + return cur; +} - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); +ggml_tensor * clip_graph::build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const { - ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); - //cb(k, "k", il); + ggml_tensor * tmp = up ? 
ggml_mul_mat(ctx0, up, cur) : cur; + cb(tmp, "ffn_up", il); - ggml_tensor * cur; + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } - if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { - ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + if (gate) { + cur = ggml_mul_mat(ctx0, gate, cur); + cb(cur, "ffn_gate", il); - k = ggml_cast(ctx0, k, GGML_TYPE_F16); - v = ggml_cast(ctx0, v, GGML_TYPE_F16); + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } - cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + // we only support parallel ffn for now + switch (type_op) { + case FFN_SILU: + if (gate) { + cur = ggml_swiglu_split(ctx0, cur, tmp); + cb(cur, "ffn_swiglu", il); + } else { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case FFN_GELU: + if (gate) { + cur = ggml_geglu_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu", il); + } else { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + } break; + case FFN_GELU_ERF: + if (gate) { + cur = ggml_geglu_erf_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_erf", il); + } else { + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "ffn_gelu_erf", il); + } break; + case FFN_GELU_QUICK: + if (gate) { + cur = ggml_geglu_quick_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_quick", il); + } else { + cur = ggml_gelu_quick(ctx0, cur); + cb(cur, "ffn_gelu_quick", il); + } break; + } - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + if (down) { + cur = ggml_mul_mat(ctx0, down, cur); + } - } else { - ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); - v = ggml_cont(ctx0, v); + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + return cur; +} + +ggml_tensor * clip_graph::build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * cur; + + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + + k = ggml_cast(ctx0, k, GGML_TYPE_F16); + v = ggml_cast(ctx0, v, GGML_TYPE_F16); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + + } else { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); // F32 may not needed for vision encoders? 
// ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); - } - - cb(cur, "kqv_out", il); - - if (wo) { - cur = ggml_mul_mat(ctx0, wo, cur); - } - - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - - return cur; + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); } - // implementation of the 2D RoPE without adding a new op in ggml - // this is not efficient (use double the memory), but works on all backends - // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 - static ggml_tensor * build_rope_2d( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * pos_a, // first half - ggml_tensor * pos_b, // second half - const float freq_base, - const bool interleave_freq - ) { - const int64_t n_dim = cur->ne[0]; - const int64_t n_head = cur->ne[1]; - const int64_t n_pos = cur->ne[2]; + cb(cur, "kqv_out", il); - // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) - // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 - // first half of cur will use 1e-0, 1e-2 (even) - // second half of cur will use 1e-1, 1e-3 (odd) - // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even - // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) - // then for the second half, we use freq_scale to shift the inv_freq - // ^ why? replace (2i) with (2i+1) in the above equation - const float freq_scale_odd = interleave_freq - ? 
std::pow(freq_base, (float)-2/n_dim) - : 1.0; - - // first half - ggml_tensor * first; - { - first = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - 0); - first = ggml_rope_ext( - ctx0, - first, - pos_a, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - 1.0f, 0.0f, 1.0f, 0.0f, 0.0f - ); - } - - // second half - ggml_tensor * second; - { - second = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - n_dim/2 * ggml_element_size(cur)); - second = ggml_rope_ext( - ctx0, - second, - pos_b, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - freq_scale_odd, - 0.0f, 1.0f, 0.0f, 0.0f - ); - } - - cur = ggml_concat(ctx0, first, second, 0); - return cur; + if (wo) { + cur = ggml_mul_mat(ctx0, wo, cur); } - // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) - // support dynamic resolution - ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { - GGML_ASSERT(scale_factor > 1); - - const int n_embd = cur->ne[0]; - int width = img.nx / patch_size; - int height = img.ny / patch_size; - - // pad width and height to factor - const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; - const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; - cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); - if (pad_width || pad_height) { - cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); - width += pad_width; - height += pad_height; - } - - // unshuffle h - cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - // unshuffle w - cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cb(cur, "pixel_shuffle", -1); - - return cur; + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); } - ggml_tensor * build_sam(ggml_tensor * inp_raw) { - const int n_embd = hparams.sam_n_embd; - const int n_layer = hparams.sam_n_layer; - const int n_heads = hparams.sam_n_head; - const int d_heads = n_embd / n_heads; - const int window = hparams.attn_window_size; + return cur; +} - ggml_tensor * inpL; +// implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +ggml_tensor * clip_graph::build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; - inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); - inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); - inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first 
half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = interleave_freq + ? std::pow(freq_base, (float)-2/n_dim) + : 1.0; - ggml_tensor * rel_pos_indices_local; - ggml_tensor * rel_pos_indices_global; - - rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); - rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); - ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); - ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); - ggml_set_input(rel_pos_indices_local); - ggml_set_input(rel_pos_indices_global); - - ggml_tensor * cur; - const auto tgt_size = inpL->ne[1]; - const auto str_size = model.pos_embed->ne[1]; - - if (str_size != tgt_size) { - ggml_tensor * old_pos_embed = nullptr; - old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); - ggml_tensor * new_pos_embed = ggml_interpolate( - ctx0, - old_pos_embed, - tgt_size, - tgt_size, - n_embd, - 1, - ggml_scale_mode::GGML_SCALE_MODE_BICUBIC - ); - new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); - cur = ggml_add(ctx0, inpL, new_pos_embed); - } else { - cur = ggml_add(ctx0, inpL, model.pos_embed); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.sam_layers[il]; - ggml_tensor * shortcut = cur; - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - - const int64_t w0 = cur->ne[1]; - const int64_t h0 = cur->ne[2]; - - ggml_tensor * indices; - - if (hparams.is_global_attn(il)) { - indices = rel_pos_indices_global; - } else { - // local attention layer - apply window partition - cur = window_partition(ctx0, cur, window); - indices = rel_pos_indices_local; - } - - const int64_t W = cur->ne[1]; - const int64_t H = cur->ne[2]; - // self-attention - { - const int B = cur->ne[3]; - - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); - cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape - cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W*H, B); - - ggml_tensor * Q; - ggml_tensor * K; - ggml_tensor * V; - - Q = ggml_view_3d (ctx0, cur, n_embd, W*H, B, cur->nb[2], cur->nb[3], 0*cur->nb[1]); - Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W*H, B); - - K = ggml_view_3d (ctx0, cur, n_embd, W*H, B, cur->nb[2], cur->nb[3], 1*cur->nb[1]); - K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W*H, B); - - V = ggml_view_3d (ctx0, cur, n_embd, W*H, B, cur->nb[2], cur->nb[3], 2*cur->nb[1]); - V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W*H, B); - - ggml_tensor * mask; - ggml_tensor * rw; - ggml_tensor * rh; - ggml_tensor * qr; - - rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] - rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] - qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); - qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); - - - rw = ggml_mul_mat (ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] - rw = ggml_cont (ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] - 
rw = ggml_reshape_4d(ctx0, rw, W, 1, W*H, n_heads*B); - rw = ggml_repeat_4d (ctx0, rw, W, H, W*H, n_heads*B); - rh = ggml_mul_mat (ctx0, rh, qr); // [B*n_heads, H, W, H] - rh = ggml_reshape_4d(ctx0, rh, 1, H, W*H, n_heads*B); - mask = ggml_add (ctx0, rw, rh); // [B*n_heads, H*W, H, W] - mask = ggml_reshape_4d(ctx0, mask, W*H, W*H, n_heads, B); - mask = ggml_cast (ctx0, mask, GGML_TYPE_F16); - - float scale = 1.0f / sqrtf((float)d_heads); - - cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, - il); // [B, H*W, n_embd] - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); - } - - if (hparams.is_global_attn(il) == false) { - // local attention layer - reverse window partition - cur = window_unpartition(ctx0, cur, w0, h0, window); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, shortcut); - - ggml_tensor * inpFF = cur; - - // layernorm2 - cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - - // ffn - cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, - layer.ff_down_b, hparams.ffn_op, il); - - // residual 2 - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "sam_layer_out", il); - } - - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); - cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); - cb(cur, "sam_output", -1); - - ggml_build_forward_expand(gf, cur); - return cur; + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_a, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); } - ggml_tensor * build_dsocr_clip(ggml_tensor * patch_embeds) { - ggml_tensor * inp; - - inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds)); - inp = ggml_reshape_2d(ctx0, inp, inp->ne[0]*inp->ne[1], inp->ne[2]); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - ggml_tensor * new_pos_embd = ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); - - int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] - const auto tgt_size = static_cast(std::sqrt(inp->ne[1])); - const auto src_size = static_cast(std::sqrt(n_pos - 1)); - - if (tgt_size != src_size) { - ggml_tensor * old_pos_embd; - ggml_tensor * cls_tok; - - old_pos_embd = ggml_view_2d( - ctx0, new_pos_embd, - new_pos_embd->ne[0], src_size * src_size, - ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0 - ); - cls_tok = ggml_view_2d( - ctx0, new_pos_embd, - new_pos_embd->ne[0], 1, - ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size - ); - new_pos_embd = ggml_interpolate(ctx0, - old_pos_embd, - tgt_size, - tgt_size, - new_pos_embd->ne[0], 1, GGML_SCALE_MODE_BICUBIC - ); - 
new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1); - new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1); - n_pos = tgt_size * tgt_size + 1; - } - - // add CLS token - inp = ggml_concat(ctx0, model.class_embedding, inp, 1); - - // for selecting learned pos embd, used by ViT - ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); - ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); - - ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, ffn_op_type::FFN_GELU_QUICK, - learned_pos_embd, nullptr); - - ggml_build_forward_expand(gf, cur); - - return cur; + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + n_dim/2 * ggml_element_size(cur)); + second = ggml_rope_ext( + ctx0, + second, + pos_b, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); } -}; + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) +// support dynamic resolution +ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cb(cur, "pixel_shuffle", -1); + + return cur; +} + + static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); - clip_graph graph(ctx, *imgs.entries[0]); - ggml_cgraph * res; + const clip_image_f32 & img = *imgs.entries[0]; + std::unique_ptr<clip_graph> builder; switch (ctx->proj_type()) { case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_JANUS_PRO: { - res = graph.build_siglip(); + builder = std::make_unique<clip_graph_siglip>(ctx, img); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { - res = graph.build_pixtral(); + builder = std::make_unique<clip_graph_pixtral>(ctx, img); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { - res = graph.build_qwen2vl(); + builder = std::make_unique<clip_graph_qwen2vl>(ctx, img); } break; case PROJECTOR_TYPE_QWEN3VL: { - res = graph.build_qwen3vl(); + builder = std::make_unique<clip_graph_qwen3vl>(ctx, img); } break; case PROJECTOR_TYPE_MINICPMV: { - res = graph.build_minicpmv(); + builder = std::make_unique<clip_graph_minicpmv>(ctx, img); } break; case PROJECTOR_TYPE_INTERNVL: { - res = graph.build_internvl(); + builder = std::make_unique<clip_graph_internvl>(ctx, img); } break; case PROJECTOR_TYPE_LLAMA4: { - res = graph.build_llama4(); + builder = 
std::make_unique<clip_graph_llama4>(ctx, img); } break; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_QWEN2A: { - res = graph.build_whisper_enc(); + builder = std::make_unique<clip_graph_whisper_enc>(ctx, img); } break; case PROJECTOR_TYPE_KIMIVL: { - res = graph.build_kimivl(); - } break; - case PROJECTOR_TYPE_JANUS_PRO: - { - res = graph.build_siglip(); + builder = std::make_unique<clip_graph_kimivl>(ctx, img); } break; case PROJECTOR_TYPE_COGVLM: { - res = graph.build_cogvlm(); + builder = std::make_unique<clip_graph_cogvlm>(ctx, img); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + builder = std::make_unique<clip_graph_llava>(ctx, img); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: - { - res = graph.build_deepseek_ocr(); - } break; - default: { - res = graph.build_llava(); + builder = std::make_unique<clip_graph_deepseekocr>(ctx, img); } break; + default: + GGML_ABORT("missing cgraph builder"); } - return res; + + return builder->build(); } +// +// clip_model_loader +// + struct clip_model_loader { ggml_context_ptr ctx_meta; gguf_context_ptr ctx_gguf; diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index e4f6566e15..144a3d7b44 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -7,6 +7,8 @@ // !!! Internal header, to be used by mtmd only !!! +#define MTMD_INTERNAL_HEADER + struct clip_ctx; struct clip_image_size { diff --git a/tools/mtmd/models/cogvlm.cpp b/tools/mtmd/models/cogvlm.cpp new file mode 100644 index 0000000000..d5b739c687 --- /dev/null +++ b/tools/mtmd/models/cogvlm.cpp @@ -0,0 +1,98 @@ +#include "models.h" + +ggml_cgraph * clip_graph_cogvlm::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // build input and concatenate class embedding + ggml_tensor * inp = build_inp(); + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + ggml_tensor * inpL = inp; + + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], n_embd * sizeof(float)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 2 * n_embd * sizeof(float)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "layer_out", il); + inpL = cur; + + } + + // remove CLS token (like build_llama4 does) + ggml_tensor * cur = ggml_view_2d(ctx0, inpL, + n_embd, n_patches, + ggml_row_size(inpL->type, n_embd), 0); + + // Multiply with mm_model_proj + cur = 
ggml_mul_mat(ctx0, model.mm_model_proj, cur); + + // Apply layernorm, weight, bias + cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + + // Apply GELU + cur = ggml_gelu_inplace(ctx0, cur); + + // Branch 1: multiply with mm_h_to_4h_w + ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + + // Branch 2: multiply with mm_gate_w + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + + // Apply silu + gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + + // Apply mm_4h_to_h_w + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + + // Concatenate with boi and eoi + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp new file mode 100644 index 0000000000..156b917b9a --- /dev/null +++ b/tools/mtmd/models/deepseekocr.cpp @@ -0,0 +1,325 @@ +#include "models.h" + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_partition(ggml_context * ctx0, ggml_tensor * x, const int window) { + auto [c, w, h, b] = x->ne; + // same as + // x = ggml_win_part(m, x, window); + // x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + ggml_tensor * cur = x; + if (px > 0 || py > 0) { + cur = ggml_pad(ctx0, cur, 0, static_cast(px), static_cast(py), 0); + } + cur = ggml_reshape_4d(ctx0, cur, c * window, npw, window, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, window, window, npw * nph * b); + return cur; +} + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_unpartition(ggml_context * ctx0, + ggml_tensor * x, + const int w, + const int h, + const int window) { + const int64_t c = x->ne[0]; + // same as + // x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]); + // x = ggml_win_unpart(m, x, w, h, window); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + const int64_t b = x->ne[3] / (npw * nph); + ggml_tensor * cur = x; + cur = ggml_reshape_4d(ctx0, cur, c * window, window, npw, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, w + px, h + py, b); + cur = ggml_view_4d(ctx0, cur, cur->ne[0], w, h, cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0); + cur = ggml_cont(ctx0, cur); + return cur; +} + +static ggml_tensor * get_rel_pos(ggml_context * ctx0, + ggml_tensor * rel_pos, // [L, C] + ggml_tensor * indices, // [q_size, k_size] + const int q_size, + const int k_size) { + const int64_t C = rel_pos->ne[0]; // channels + const int64_t L = rel_pos->ne[1]; // length + + GGML_ASSERT(indices != nullptr); + GGML_ASSERT(indices->type == GGML_TYPE_I32); + GGML_ASSERT(indices->ne[0] == k_size); + GGML_ASSERT(indices->ne[1] == q_size); + + const auto max_rel_dist = 2 * std::max(q_size, k_size) - 1; + ggml_tensor * cur = rel_pos; + + if (max_rel_dist != L) { + // 
Linear interpolation + const int64_t ne0 = cur->ne[0]; + const int64_t ne1 = cur->ne[1]; + const int64_t ne2 = cur->ne[2]; + const int64_t ne3 = cur->ne[3]; + + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)), ne1, 1, ne0 * ne2 * ne3); + cur = ggml_reshape_4d( + ctx0, ggml_interpolate(ctx0, cur, max_rel_dist, 1, ne0 * ne2 * ne3, 1, GGML_SCALE_MODE_BILINEAR), + max_rel_dist, ne0, ne2, ne3); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)); + } + + // Flatten indices to 1D for ggml_get_rows + const int qk = q_size * k_size; + + cur = ggml_reshape_3d(ctx0, ggml_get_rows(ctx0, cur, ggml_reshape_1d(ctx0, indices, qk)), C, k_size, q_size); + + return cur; // [C, k_size, q_size] +} + +ggml_cgraph * clip_graph_deepseekocr::build() { + //patch embedding + ggml_tensor * inp_raw = build_inp_raw(); + //ggml_tensor * sam_out = build_sam(inp_raw); + + ggml_tensor * sam_out; + // Building SAM + { + const int n_embd = hparams.sam_n_embd; + const int n_layer = hparams.sam_n_layer; + const int n_heads = hparams.sam_n_head; + const int d_heads = n_embd / n_heads; + const int window = hparams.attn_window_size; + + ggml_tensor * inpL; + + inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); + inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); + inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); + + ggml_tensor * rel_pos_indices_local; + ggml_tensor * rel_pos_indices_global; + + rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); + rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); + ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); + ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); + ggml_set_input(rel_pos_indices_local); + ggml_set_input(rel_pos_indices_global); + + ggml_tensor * cur; + const auto tgt_size = inpL->ne[1]; + const auto str_size = model.pos_embed->ne[1]; + + if (str_size != tgt_size) { + ggml_tensor * old_pos_embed = nullptr; + old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); + ggml_tensor * new_pos_embed = + ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); + new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); + cur = ggml_add(ctx0, inpL, new_pos_embed); + } else { + cur = ggml_add(ctx0, inpL, model.pos_embed); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.sam_layers[il]; + ggml_tensor * shortcut = cur; + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + const int64_t w0 = cur->ne[1]; + const int64_t h0 = cur->ne[2]; + + ggml_tensor * indices; + + if (hparams.is_global_attn(il)) { + indices = rel_pos_indices_global; + } else { + // local attention layer - apply window partition + cur = window_partition(ctx0, cur, window); + indices = rel_pos_indices_local; + } + + const int64_t W = cur->ne[1]; + const int64_t H = cur->ne[2]; + // self-attention + { + const int B = cur->ne[3]; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape + cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); + + ggml_tensor * Q; + ggml_tensor * K; + ggml_tensor * V; + + Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); + Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), 
d_heads, n_heads, W * H, B); + + K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); + K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); + + V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); + V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); + + ggml_tensor * mask; + ggml_tensor * rw; + ggml_tensor * rh; + ggml_tensor * qr; + + rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] + rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] + qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); + qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); + + rw = ggml_mul_mat(ctx0, rw, + ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] + rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] + rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); + rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); + rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] + rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); + mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] + mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); + mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); + + const float scale = 1.0f / sqrtf(static_cast(d_heads)); + + cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, + il); // [B, H*W, n_embd] + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); + } + + if (hparams.is_global_attn(il) == false) { + // local attention layer - reverse window partition + cur = window_unpartition(ctx0, cur, w0, h0, window); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, shortcut); + + ggml_tensor * inpFF = cur; + + // layernorm2 + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + + // ffn + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, cur, inpFF); + cb(cur, "sam_layer_out", il); + } + + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); + cb(cur, "sam_output", -1); + + ggml_build_forward_expand(gf, cur); + sam_out = cur; + } + //ggml_tensor * clip_out = build_dsocr_clip(sam_out); + ggml_tensor * clip_out; + // Building DS-OCR CLIP + { + ggml_tensor * inp; + + inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out)); + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + ggml_tensor * new_pos_embd = + ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); + + int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] + const auto tgt_size = 
static_cast(std::sqrt(inp->ne[1])); + const auto src_size = static_cast(std::sqrt(n_pos - 1)); + + if (tgt_size != src_size) { + ggml_tensor * old_pos_embd; + ggml_tensor * cls_tok; + + old_pos_embd = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0); + cls_tok = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], 1, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size); + new_pos_embd = ggml_interpolate(ctx0, old_pos_embd, tgt_size, tgt_size, new_pos_embd->ne[0], 1, + GGML_SCALE_MODE_BICUBIC); + new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1); + new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1); + n_pos = tgt_size * tgt_size + 1; + } + + // add CLS token + inp = ggml_concat(ctx0, model.class_embedding, inp, 1); + + // for selecting learned pos embd, used by ViT + ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); + + ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr); + + ggml_build_forward_expand(gf, cur); + clip_out = cur; + } + + const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; + + sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); + sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); + clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); + + ggml_tensor * cur; + cur = ggml_concat(ctx0, clip_out, sam_out, 0); + cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); + cur = ggml_cont(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.fc_w, cur); + cur = ggml_add(ctx0, cur, model.fc_b); + + const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); + const auto w = h; + const auto n_dim = cur->ne[0]; + + ggml_tensor * imgnl; + ggml_tensor * vs; + + imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); + vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) + cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); + cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); + cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) + + cb(cur, "dsocr_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/internvl.cpp b/tools/mtmd/models/internvl.cpp new file mode 100644 index 0000000000..9aded3b97c --- /dev/null +++ b/tools/mtmd/models/internvl.cpp @@ -0,0 +1,69 @@ +#include "models.h" + +ggml_cgraph * clip_graph_internvl::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; + ggml_tensor * inp = build_inp(); + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // The larger models use a different ViT, which uses RMS norm instead of layer norm + // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 + norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) + ? 
NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) + : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) + + ggml_tensor * cur = build_vit( + inp, n_pos, + norm_t, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = n_patches_y; + const int width = n_patches_x; + GGML_ASSERT(scale_factor > 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + cur->ne[1] * cur->ne[2]); + } + + // projector (always using GELU activation) + { + // projector LayerNorm uses pytorch's default eps = 1e-5 + // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_3_w, model.mm_3_b, + FFN_GELU, + -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/kimivl.cpp b/tools/mtmd/models/kimivl.cpp new file mode 100644 index 0000000000..0a06f5090e --- /dev/null +++ b/tools/mtmd/models/kimivl.cpp @@ -0,0 +1,63 @@ +#include "models.h" + +ggml_cgraph * clip_graph_kimivl::build() { + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + cur = ggml_view_2d(ctx0, cur, + n_embd, cur->ne[1] * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff 
--git a/tools/mtmd/models/llama4.cpp b/tools/mtmd/models/llama4.cpp new file mode 100644 index 0000000000..30d1df5bcd --- /dev/null +++ b/tools/mtmd/models/llama4.cpp @@ -0,0 +1,96 @@ +#include "models.h" + +ggml_cgraph * clip_graph_llama4::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp = build_inp_raw(); + + // Llama4UnfoldConvolution + { + ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, + patch_size, patch_size, 3, n_embd); + inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + cb(inp, "patch_conv", -1); + } + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 + // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + add_pos); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + // based on Llama4VisionPixelShuffleMLP + // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + GGML_ASSERT(scale_factor > 0); + GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images + cur = ggml_reshape_4d(ctx0, cur, + n_embd * scale_factor, + n_patches_x / scale_factor, + n_patches_y, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches_x / scale_factor, + n_patches_y / scale_factor, + bsz); + //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches / scale_factor / scale_factor); + cb(cur, "pixel_shuffle", -1); + } + + // based on Llama4VisionMLP2 (always uses GELU activation, no bias) + { + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); + cur = ggml_gelu(ctx0, cur); + cb(cur, "adapter_mlp", -1); + } + + // Llama4MultiModalProjector + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cb(cur, "projected", -1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp new file mode 100644 index 0000000000..0bfb5f05f6 --- 
/dev/null +++ b/tools/mtmd/models/llava.cpp @@ -0,0 +1,374 @@ +#include "models.h" + +// this graph is used by llava, granite and glm +// due to having embedding_stack (used by granite), we cannot reuse build_vit +ggml_cgraph * clip_graph_llava::build() { + const int batch_size = 1; + const int n_pos = n_patches + (model.class_embedding ? 1 : 0); + + GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); + + // Calculate the deepest feature layer based on hparams and projector type + int max_feature_layer = n_layer; + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int il_last = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) { + il_last += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; + } + + ggml_tensor * inp = build_inp(); + + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); + cb(inpL, "pre_ln", -1); + } + + std::vector embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + + // loop over layers + for (int il = 0; il < max_feature_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. 
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(cur); + } + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * embeddings = inpL; + + // process vision feature layers (used by granite) + { + // final layer is a vision feature layer + if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + embedding_stack.push_back(inpL); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + } + + // llava projector (also used by granite) + if (hparams.has_llava_projector) { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + if (model.mm_2_w) { + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + } + else if (proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // 
GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], 
ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = 
ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else { + GGML_ABORT("fatal error"); + } + } + + // glm projector + else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_swiglu_split(ctx0, embeddings, x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + } + // arrangement of BOI/EOI token embeddings + // note: these embeddings are not present in text model, hence we cannot process them as text tokens + // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 + { + embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI + } + } + + else { + GGML_ABORT("llava: unknown projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/minicpmv.cpp b/tools/mtmd/models/minicpmv.cpp new file mode 100644 index 0000000000..3594ea29fa --- /dev/null +++ b/tools/mtmd/models/minicpmv.cpp @@ -0,0 +1,114 @@ +#include "models.h" + +ggml_cgraph * clip_graph_minicpmv::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + const int n_embd_proj = n_mmproj_embd; + + // position embeddings for the projector (not for ViT) + // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 + // base frequency omega + ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); + ggml_set_name(omega, "omega"); + ggml_set_input(omega); + + // 2D input positions (using float for sinusoidal embeddings) + ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + 
hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // calculate sinusoidal pos embd + ggml_tensor * pos_embed = nullptr; + { + // outer product + ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows + ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); + ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); + // sin and cos + ggml_tensor * pos_embd_x = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_x), + ggml_cos(ctx0, theta_x), + 0 // concat on first dim + ); + ggml_tensor * pos_embd_y = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_y), + ggml_cos(ctx0, theta_y), + 0 // concat on first dim + ); + pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); + } + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + const int d_head = 128; + int n_head = n_embd_proj/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, resampler_kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h new file mode 100644 index 0000000000..bf020516fb --- /dev/null +++ b/tools/mtmd/models/models.h @@ -0,0 +1,63 @@ +#pragma once + +#include "../clip-graph.h" + +struct clip_graph_siglip : clip_graph { + clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_pixtral : clip_graph { + clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen2vl : clip_graph { + clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen3vl : clip_graph { + clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_minicpmv : clip_graph { + clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : 
clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_internvl : clip_graph { + clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llama4 : clip_graph { + clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_kimivl : clip_graph { + clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_cogvlm : clip_graph { + clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llava : clip_graph { + clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_whisper_enc : clip_graph { + clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_deepseekocr : clip_graph { + clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/models/pixtral.cpp b/tools/mtmd/models/pixtral.cpp new file mode 100644 index 0000000000..a849210b53 --- /dev/null +++ b/tools/mtmd/models/pixtral.cpp @@ -0,0 +1,86 @@ +#include "models.h" + +ggml_cgraph * clip_graph_pixtral::build() { + const int n_merge = hparams.n_merge; + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // no learned pos embd + add_pos); + + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.n_merge > 0); + + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + } + + // LlavaMultiModalProjector (always using GELU activation) + { + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + } + + // arrangement of the [IMG_BREAK] token + if (model.token_embd_img_break) { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with 
shape [n_embd, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] + + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; + const int p_total = p_x * p_y; + const int n_embd_text = cur->ne[0]; + const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + tmp = ggml_concat(ctx0, tmp, tok, 1); + cur = ggml_view_2d(ctx0, tmp, + n_embd_text, n_tokens_output, + ggml_row_size(tmp->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp new file mode 100644 index 0000000000..85f158bb1c --- /dev/null +++ b/tools/mtmd/models/qwen2vl.cpp @@ -0,0 +1,183 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen2vl::build() { + GGML_ASSERT(model.patch_bias == nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const bool use_window_attn = hparams.n_wa_pattern > 0; + const int n_wa_pattern = hparams.n_wa_pattern; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL + ? NORM_TYPE_RMS // qwen 2.5 vl + : NORM_TYPE_NORMAL; // qwen 2 vl + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we 
need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + ggml_tensor * attn_mask = full_attn ? 
nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp new file mode 100644 index 0000000000..35a42cb84d --- /dev/null +++ b/tools/mtmd/models/qwen3vl.cpp @@ -0,0 +1,191 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen3vl::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + if (model.patch_bias != nullptr) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + 
learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "inp_pos_emb", -1); + + ggml_tensor * inpL = inp; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] + ggml_tensor * deepstack_features = nullptr; + const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + if (layer.has_deepstack()) { + ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); + feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); + feat = build_ffn(feat, + layer.deepstack_fc1_w, layer.deepstack_fc1_b, + nullptr, nullptr, + layer.deepstack_fc2_w, layer.deepstack_fc2_b, + 
ffn_op_type::FFN_GELU, il); + + if(!deepstack_features) { + deepstack_features = feat; + } else { + // concat along the feature dimension + deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + } + } + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + ffn_op_type::FFN_GELU, -1); + + embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp new file mode 100644 index 0000000000..191694235d --- /dev/null +++ b/tools/mtmd/models/siglip.cpp @@ -0,0 +1,81 @@ +#include "models.h" + +ggml_cgraph * clip_graph_siglip::build() { + ggml_tensor * inp = build_inp(); + + ggml_tensor * learned_pos_embd = model.position_embeddings; + if (proj_type == PROJECTOR_TYPE_LFM2) { + learned_pos_embd = resize_position_embeddings(); + } + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + if (proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + GGML_ASSERT(n_patches_x == n_patches_y); + const int patches_per_image = n_patches_x; + const int kernel_size = hparams.n_merge; + + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + + // doing a pool2d to reduce the number of output tokens + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + // apply norm before projection + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + + // apply projection + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // pixel_shuffle + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + cur = ggml_mul_mat(ctx0, model.fc_w, cur); + + } else if (proj_type == PROJECTOR_TYPE_LFM2) { + // pixel unshuffle block + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + + } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) { + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + hparams.ffn_op, + -1); + + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git 
a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp new file mode 100644 index 0000000000..07d378b095 --- /dev/null +++ b/tools/mtmd/models/whisper-enc.cpp @@ -0,0 +1,107 @@ +#include "models.h" + +ggml_cgraph * clip_graph_whisper_enc::build() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * inp = build_inp_raw(1); + + // conv1d block + { + // convolution + gelu + ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_1_b); + + cur = ggml_gelu_erf(ctx0, cur); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); + cur = ggml_add(ctx0, cur, model.conv1d_2_b); + + cur = ggml_gelu_erf(ctx0, cur); + // transpose + inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(inp, "after_conv1d", -1); + } + + // sanity check (only check one layer, but it should be the same for all) + GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); + GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); + GGML_ASSERT(model.layers[0].q_b); + GGML_ASSERT(model.layers[0].v_b); + GGML_ASSERT(!model.layers[0].k_b); // no bias for k + GGML_ASSERT(model.post_ln_w && model.post_ln_b); + + ggml_tensor * pos_embd_selected = ggml_view_2d( + ctx0, model.position_embeddings, + model.position_embeddings->ne[0], n_pos, + model.position_embeddings->nb[1], 0 + ); + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + pos_embd_selected, + nullptr); + + cb(cur, "after_transformer", -1); + + if (model.audio_has_stack_frames()) { + // StackAudioFrames + // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py + int64_t stride = n_embd * hparams.proj_stack_factor; + int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); + int64_t pad = padded_len - ggml_nelements(cur); + if (pad > 0) { + cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + } + cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, + ggml_row_size(cur->type, stride), 0); + cb(cur, "after_stacked", -1); + } + + if (proj_type == PROJECTOR_TYPE_ULTRAVOX) { + // UltravoxProjector + // pre-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + + // ffn in + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + + // swiglu + // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half + cur = ggml_swiglu_swapped(ctx0, cur); + + // mid-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); + + // ffn out + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + + } else if (proj_type == PROJECTOR_TYPE_QWEN2A) { + // projector + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + } else { + GGML_ABORT("%s: unknown projector type", __func__); + } + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index b7b940affb..0e552347a0 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -6,6 +6,8 @@ #include #include +#define MTMD_INTERNAL_HEADER + #define WHISPER_ASSERT GGML_ASSERT #define WHISPER_SAMPLE_RATE 16000 diff --git 
a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index f0891bba30..902a4b456d 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -32,6 +32,10 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb/stb_image.h" +#ifdef MTMD_INTERNAL_HEADER +#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers" +#endif + // // internal logging functions // diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index b3df24c299..9f7e861e92 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -22,6 +22,11 @@ * Issues related to API usage may receive lower priority support. * * For the usage, see an example in mtmd-cli.cpp + * + * For contributors: + * - Make sure the C API is aligned with the libllama C API (as in llama.h) + * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead + * - Keep the API minimal, do not expose internal details unless necessary */ #ifdef LLAMA_SHARED diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 20a9359bed..80db07837a 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -107,6 +107,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working + add_test_vision "sabafallah/DeepSeek-OCR-GGUF:q8_0" -p "Free OCR." --chat-template deepseek-ocr add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M" add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" diff --git a/tools/server/README.md b/tools/server/README.md index d6b9b87dcf..073bcd2ccd 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -54,9 +54,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | | `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)
(env: LLAMA_ARG_KV_UNIFIED) | | `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | -| `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | -| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | -| `--no-escape` | do not process escape sequences | +| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | +| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | | `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | @@ -66,15 +65,15 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | -| `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-nr, --no-repack` | disable weight repacking
(env: LLAMA_ARG_NO_REPACK) | -| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | +| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | +| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_HOST) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_NO_MMAP) | +| `--mmap, --no-mmap` | whether to memory-map the model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -87,7 +86,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | | `--check-tensors` | check model tensor data for invalid values (default: false) | | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | -| `--no-op-offload` | disable offloading host tensor operations to device (default: false) | +| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | | `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | @@ -157,19 +156,18 @@ For the ful list of features, please refer to [server's changelog](https://githu | -------- | ----------- | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | -| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | -| `--context-shift` | enables context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | +| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode
| | `-sp, --special` | special tokens output enabled (default: false) | -| `--no-warmup` | skip warming up the model with an empty run | +| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | -| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | -| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | -| `--mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | -| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | -| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf
(env: LLAMA_ARG_NO_MMPROJ) | -| `--no-mmproj-offload` | do not offload multimodal projector to GPU
(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | +| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | +| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | +| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use a multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | +| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--override-tensor-draft, -otd =,...` | override tensor buffer type for draft model | @@ -180,7 +178,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | -| `--no-webui` | Disable the Web UI (default: enabled)
(env: LLAMA_ARG_NO_WEBUI) | +| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | @@ -193,20 +191,19 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | -| `--slots` | enable slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | -| `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | +| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | +| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) | | `--models-dir PATH` | directory containing models for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_DIR) | +| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_PRESET) | | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)
(env: LLAMA_ARG_MODELS_MAX) | -| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)
(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) | -| `--no-models-autoload` | disables automatic loading of models (default: enabled)
(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | -| `--jinja` | use jinja template for chat (default: enabled)

(env: LLAMA_ARG_JINJA) | -| `--no-jinja` | disable jinja template for chat (default: enabled)

(env: LLAMA_ARG_NO_JINJA) | +| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)
(env: LLAMA_ARG_MODELS_AUTOLOAD) | +| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | -| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | +| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | @@ -236,6 +233,11 @@ For the ful list of features, please refer to [server's changelog](https://githu Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. +For boolean options like `--mmap` or `--kv-offload`, the environment variable is handled as shown in this example: +- `LLAMA_ARG_MMAP=true` means enabled, other accepted values are: `1`, `on`, `enabled` +- `LLAMA_ARG_MMAP=false` means disabled, other accepted values are: `0`, `off`, `disabled` +- If `LLAMA_ARG_NO_MMAP` is present (no matter the value), it means disabling mmap + Example usage of docker compose with environment variables: ```yml diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 2db04e9522..3fd631b77a 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json index 9c1c2499cf..4f37b308b1 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/server/webui/package-lock.json @@ -41,7 +41,7 @@ "@tailwindcss/vite": "^4.0.0", "@types/node": "^22", "@vitest/browser": "^3.2.3", - "bits-ui": "^2.8.11", + "bits-ui": "^2.14.4", "clsx": "^2.1.1", "dexie": "^4.0.11", "eslint": "^9.18.0", @@ -3343,17 +3343,17 @@ } }, "node_modules/bits-ui": { - "version": "2.8.11", - "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz", - "integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==", + "version": "2.14.4", + "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz", + "integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==", "dev": true, "license": "MIT", "dependencies": { "@floating-ui/core": "^1.7.1", "@floating-ui/dom": "^1.7.1", "esm-env": "^1.1.2", - "runed": "^0.29.1", - "svelte-toolbelt": "^0.9.3", + "runed": "^0.35.1", + "svelte-toolbelt": "^0.10.6", "tabbable": "^6.2.0" }, "engines": { @@ -3368,9 +3368,9 @@ } }, "node_modules/bits-ui/node_modules/runed": { - "version": "0.29.2", - "resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz", - "integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==", + "version": "0.35.1", + "resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz", + "integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte", @@ -3378,23 +3378,31 @@ ], "license": "MIT", "dependencies": { - "esm-env": "^1.0.0" + "dequal": "^2.0.3", + "esm-env": "^1.0.0", + "lz-string": "^1.5.0" }, "peerDependencies": { + "@sveltejs/kit": "^2.21.0", "svelte": "^5.7.0" + }, + "peerDependenciesMeta": { + "@sveltejs/kit": { + "optional": true + } } }, "node_modules/bits-ui/node_modules/svelte-toolbelt": { - "version": "0.9.3", - "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz", - "integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==", + "version": "0.10.6", + "resolved": 
"https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz", + "integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte" ], "dependencies": { "clsx": "^2.1.1", - "runed": "^0.29.0", + "runed": "^0.35.1", "style-to-object": "^1.0.8" }, "engines": { diff --git a/tools/server/webui/package.json b/tools/server/webui/package.json index 987a7239ed..c20ab3cfde 100644 --- a/tools/server/webui/package.json +++ b/tools/server/webui/package.json @@ -43,7 +43,7 @@ "@tailwindcss/vite": "^4.0.0", "@types/node": "^22", "@vitest/browser": "^3.2.3", - "bits-ui": "^2.8.11", + "bits-ui": "^2.14.4", "clsx": "^2.1.1", "dexie": "^4.0.11", "eslint": "^9.18.0", diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte index 7f8e38286d..78cc1c47da 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte @@ -331,6 +331,7 @@ class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled ? 'cursor-not-allowed opacity-60' : ''} {className}" + data-slot="chat-form" > - import { Input } from '$lib/components/ui/input'; - import { Search } from '@lucide/svelte'; + import { SearchInput } from '$lib/components/app'; interface Props { value?: string; @@ -15,19 +14,6 @@ onInput, class: className }: Props = $props(); - - function handleInput(event: Event) { - const target = event.target as HTMLInputElement; - - value = target.value; - onInput?.(target.value); - } -
- - - -
+ diff --git a/tools/server/webui/src/lib/components/app/index.ts b/tools/server/webui/src/lib/components/app/index.ts index 87b24598b7..8631d4fb3b 100644 --- a/tools/server/webui/src/lib/components/app/index.ts +++ b/tools/server/webui/src/lib/components/app/index.ts @@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte'; export { default as MarkdownContent } from './misc/MarkdownContent.svelte'; export { default as RemoveButton } from './misc/RemoveButton.svelte'; +export { default as SearchInput } from './misc/SearchInput.svelte'; export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte'; export { default as ModelsSelector } from './models/ModelsSelector.svelte'; diff --git a/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte new file mode 100644 index 0000000000..15cd6abaa9 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte @@ -0,0 +1,73 @@ + + +
+ + + + + {#if showClearButton} + + {/if} +
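As a rough illustration of what the shared `SearchInput` component encapsulates — a two-way `value` binding, an optional `onInput` callback, and a clear button that only appears while there is text — here is a minimal TypeScript sketch. Only the props (`value`, `placeholder`, `onInput`, `class`) and the input-handler behaviour are taken from the diff; the helper names are hypothetical:

```ts
// Hypothetical sketch of the shared SearchInput logic (illustrative, not the shipped component).
// Props mirror what ChatSidebarSearch passes: value, placeholder, onInput, class.
interface SearchInputProps {
	value?: string;
	placeholder?: string;
	onInput?: (value: string) => void;
	class?: string;
}

// The clear button is only rendered while there is text to clear.
function shouldShowClearButton(value: string | undefined): boolean {
	return (value ?? '').length > 0;
}

// Keep the bound value in sync and notify the parent on every input event.
function handleInput(props: SearchInputProps, event: Event): string {
	const target = event.target as HTMLInputElement;
	props.onInput?.(target.value);
	return target.value; // new value for the `value` binding
}

// Clearing resets the bound value and notifies the parent with an empty string.
function handleClear(props: SearchInputProps): string {
	props.onInput?.('');
	return '';
}
```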
diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte index c4331e92f1..ac0937696d 100644 --- a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +++ b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte @@ -2,8 +2,8 @@ import { onMount, tick } from 'svelte'; import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte'; import * as Tooltip from '$lib/components/ui/tooltip'; + import * as Popover from '$lib/components/ui/popover'; import { cn } from '$lib/components/ui/utils'; - import { portalToBody } from '$lib/utils'; import { modelsStore, modelOptions, @@ -17,12 +17,8 @@ import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte'; import { ServerModelStatus } from '$lib/enums'; import { isRouterMode } from '$lib/stores/server.svelte'; - import { DialogModelInformation } from '$lib/components/app'; - import { - MENU_MAX_WIDTH, - MENU_OFFSET, - VIEWPORT_GUTTER - } from '$lib/constants/floating-ui-constraints'; + import { DialogModelInformation, SearchInput } from '$lib/components/app'; + import type { ModelOption } from '$lib/types/models'; interface Props { class?: string; @@ -145,185 +141,126 @@ return options.some((option) => option.model === currentModel); }); - let isOpen = $state(false); - let showModelDialog = $state(false); - let container: HTMLDivElement | null = null; - let menuRef = $state(null); - let triggerButton = $state(null); - let menuPosition = $state<{ - top: number; - left: number; - width: number; - placement: 'top' | 'bottom'; - maxHeight: number; - } | null>(null); + let searchTerm = $state(''); + let searchInputRef = $state(null); + let highlightedIndex = $state(-1); - onMount(async () => { - try { - await modelsStore.fetch(); - } catch (error) { - console.error('Unable to load models:', error); - } + let filteredOptions: ModelOption[] = $derived( + (() => { + const term = searchTerm.trim().toLowerCase(); + if (!term) return options; + + return options.filter( + (option) => + option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term) + ); + })() + ); + + // Get indices of compatible options for keyboard navigation + let compatibleIndices = $derived( + filteredOptions + .map((option, index) => (isModelCompatible(option) ? 
index : -1)) + .filter((i) => i !== -1) + ); + + // Reset highlighted index when search term changes + $effect(() => { + void searchTerm; + highlightedIndex = -1; }); - function toggleOpen() { + let isOpen = $state(false); + let showModelDialog = $state(false); + + onMount(() => { + modelsStore.fetch().catch((error) => { + console.error('Unable to load models:', error); + }); + }); + + function handleOpenChange(open: boolean) { if (loading || updating) return; - if (isRouter) { - // Router mode: show dropdown - if (isOpen) { - closeMenu(); - } else { - openMenu(); + if (open) { + isOpen = true; + searchTerm = ''; + highlightedIndex = -1; + + // Focus search input after popover opens + tick().then(() => { + requestAnimationFrame(() => searchInputRef?.focus()); + }); + + if (isRouter) { + modelsStore.fetchRouterModels().then(() => { + modelsStore.fetchModalitiesForLoadedModels(); + }); } } else { - // Single model mode: show dialog - showModelDialog = true; + isOpen = false; + searchTerm = ''; + highlightedIndex = -1; } } - async function openMenu() { + function handleTriggerClick() { if (loading || updating) return; - isOpen = true; - await tick(); - updateMenuPosition(); - requestAnimationFrame(() => updateMenuPosition()); - - if (isRouter) { - modelsStore.fetchRouterModels().then(() => { - modelsStore.fetchModalitiesForLoadedModels(); - }); + if (!isRouter) { + // Single model mode: show dialog instead of popover + showModelDialog = true; } + // For router mode, the Popover handles open/close } export function open() { if (isRouter) { - openMenu(); + handleOpenChange(true); } else { showModelDialog = true; } } function closeMenu() { - if (!isOpen) return; - - isOpen = false; - menuPosition = null; + handleOpenChange(false); } - function handlePointerDown(event: PointerEvent) { - if (!container) return; + function handleSearchKeyDown(event: KeyboardEvent) { + if (event.isComposing) return; - const target = event.target as Node | null; + if (event.key === 'ArrowDown') { + event.preventDefault(); + if (compatibleIndices.length === 0) return; - if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) { - closeMenu(); - } - } - - function handleKeydown(event: KeyboardEvent) { - if (event.key === 'Escape') { - closeMenu(); - } - } - - function handleResize() { - if (isOpen) { - updateMenuPosition(); - } - } - - function updateMenuPosition() { - if (!isOpen || !triggerButton || !menuRef) return; - - const triggerRect = triggerButton.getBoundingClientRect(); - const viewportWidth = window.innerWidth; - const viewportHeight = window.innerHeight; - - if (viewportWidth === 0 || viewportHeight === 0) return; - - const scrollWidth = menuRef.scrollWidth; - const scrollHeight = menuRef.scrollHeight; - - const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2); - const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH); - const safeMaxWidth = - constrainedMaxWidth > 0 ? 
constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth); - const desiredMinWidth = Math.min(160, safeMaxWidth || 160); - - let width = Math.min( - Math.max(triggerRect.width, scrollWidth, desiredMinWidth), - safeMaxWidth || 320 - ); - - const availableBelow = Math.max( - 0, - viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET - ); - const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET); - const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2); - const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight); - - function computePlacement(placement: 'top' | 'bottom') { - const available = placement === 'bottom' ? availableBelow : availableAbove; - const allowedHeight = - available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance; - const maxHeight = Math.min(scrollHeight, allowedHeight); - const height = Math.max(0, maxHeight); - - let top: number; - if (placement === 'bottom') { - const rawTop = triggerRect.bottom + MENU_OFFSET; - const minTop = VIEWPORT_GUTTER; - const maxTop = viewportHeight - VIEWPORT_GUTTER - height; - if (maxTop < minTop) { - top = minTop; - } else { - top = Math.min(Math.max(rawTop, minTop), maxTop); - } + const currentPos = compatibleIndices.indexOf(highlightedIndex); + if (currentPos === -1 || currentPos === compatibleIndices.length - 1) { + highlightedIndex = compatibleIndices[0]; } else { - const rawTop = triggerRect.top - MENU_OFFSET - height; - const minTop = VIEWPORT_GUTTER; - const maxTop = viewportHeight - VIEWPORT_GUTTER - height; - if (maxTop < minTop) { - top = minTop; - } else { - top = Math.max(Math.min(rawTop, maxTop), minTop); + highlightedIndex = compatibleIndices[currentPos + 1]; + } + } else if (event.key === 'ArrowUp') { + event.preventDefault(); + if (compatibleIndices.length === 0) return; + + const currentPos = compatibleIndices.indexOf(highlightedIndex); + if (currentPos === -1 || currentPos === 0) { + highlightedIndex = compatibleIndices[compatibleIndices.length - 1]; + } else { + highlightedIndex = compatibleIndices[currentPos - 1]; + } + } else if (event.key === 'Enter') { + event.preventDefault(); + if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) { + const option = filteredOptions[highlightedIndex]; + if (isModelCompatible(option)) { + handleSelect(option.id); } - } - - return { placement, top, height, maxHeight }; - } - - const belowMetrics = computePlacement('bottom'); - const aboveMetrics = computePlacement('top'); - - let metrics = belowMetrics; - if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) { - metrics = aboveMetrics; - } - - let left = triggerRect.right - width; - const maxLeft = viewportWidth - VIEWPORT_GUTTER - width; - if (maxLeft < VIEWPORT_GUTTER) { - left = VIEWPORT_GUTTER; - } else { - if (left > maxLeft) { - left = maxLeft; - } - if (left < VIEWPORT_GUTTER) { - left = VIEWPORT_GUTTER; + } else if (compatibleIndices.length > 0) { + // No selection - highlight first compatible option + highlightedIndex = compatibleIndices[0]; } } - - menuPosition = { - top: Math.round(metrics.top), - left: Math.round(left), - width: Math.round(width), - placement: metrics.placement, - maxHeight: Math.round(metrics.maxHeight) - }; } async function handleSelect(modelId: string) { @@ -356,6 +293,14 @@ if (shouldCloseMenu) { closeMenu(); + + // Focus the chat textarea after model selection + requestAnimationFrame(() => { + const textarea = 
document.querySelector( + '[data-slot="chat-form"] textarea' + ); + textarea?.focus(); + }); } } @@ -404,10 +349,7 @@ } - - - -
+
{#if loading && options.length === 0 && isRouter}
@@ -418,9 +360,8 @@ {:else} {@const selectedOption = getDisplayOption()} -
- + - {#if isOpen && isRouter} -
+ +
0 - ? `${menuPosition.maxHeight}px` - : undefined} + class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0" + > + +
+
{#if !isCurrentModelInCache() && currentModel}
- {/if} -
+
+ {/if}
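The model selector now delegates positioning to the Popover primitives and drives keyboard navigation purely through `compatibleIndices`. A minimal sketch of that wrap-around cycling, factored out as a pure helper (the function name is an assumption; the behaviour mirrors `handleSearchKeyDown` in the diff above):

```ts
// Sketch of the ArrowDown/ArrowUp cycling used by the model selector:
// navigation wraps around and only ever lands on indices of compatible options.
function nextHighlightedIndex(
	compatibleIndices: number[], // indices into filteredOptions that are selectable
	highlightedIndex: number,    // current highlight, -1 when nothing is highlighted
	key: 'ArrowDown' | 'ArrowUp'
): number {
	if (compatibleIndices.length === 0) return -1;

	const pos = compatibleIndices.indexOf(highlightedIndex);

	if (key === 'ArrowDown') {
		// wrap to the first compatible option from the end (or when nothing is highlighted)
		if (pos === -1 || pos === compatibleIndices.length - 1) return compatibleIndices[0];
		return compatibleIndices[pos + 1];
	}

	// ArrowUp: wrap to the last compatible option from the start
	if (pos === -1 || pos === 0) return compatibleIndices[compatibleIndices.length - 1];
	return compatibleIndices[pos - 1];
}

// Example: with compatible indices [0, 2, 5] and nothing highlighted,
// ArrowDown highlights 0, then 2, then 5, then wraps back to 0.
```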
diff --git a/tools/server/webui/src/lib/components/ui/popover/index.ts b/tools/server/webui/src/lib/components/ui/popover/index.ts new file mode 100644 index 0000000000..c5937fb3a0 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/index.ts @@ -0,0 +1,19 @@ +import Root from './popover.svelte'; +import Close from './popover-close.svelte'; +import Content from './popover-content.svelte'; +import Trigger from './popover-trigger.svelte'; +import Portal from './popover-portal.svelte'; + +export { + Root, + Content, + Trigger, + Close, + Portal, + // + Root as Popover, + Content as PopoverContent, + Trigger as PopoverTrigger, + Close as PopoverClose, + Portal as PopoverPortal +}; diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte new file mode 100644 index 0000000000..dc4dec4b33 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte new file mode 100644 index 0000000000..2d3513d347 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte @@ -0,0 +1,37 @@ + + + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte new file mode 100644 index 0000000000..25efb877b7 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte new file mode 100644 index 0000000000..5ef3d0e932 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte @@ -0,0 +1,17 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover.svelte b/tools/server/webui/src/lib/components/ui/popover/popover.svelte new file mode 100644 index 0000000000..f39b867a69 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts index c95d3f1841..003fc77acb 100644 --- a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts +++ b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts @@ -1,3 +1,2 @@ export const VIEWPORT_GUTTER = 8; export const MENU_OFFSET = 6; -export const MENU_MAX_WIDTH = 320; diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts index 29416c2fe5..34b26403e4 100644 --- a/tools/server/webui/src/lib/stores/models.svelte.ts +++ b/tools/server/webui/src/lib/stores/models.svelte.ts @@ -295,14 +295,21 @@ class ModelsStore { * Fetch props for a specific model from /props endpoint * Uses caching to avoid redundant requests * + * In ROUTER mode, this will only fetch props if the model is loaded, + * since unloaded models return 400 from /props endpoint. 
+ * * @param modelId - Model identifier to fetch props for - * @returns Props data or null if fetch failed + * @returns Props data or null if fetch failed or model not loaded */ async fetchModelProps(modelId: string): Promise { // Return cached props if available const cached = this.modelPropsCache.get(modelId); if (cached) return cached; + if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) { + return null; + } + // Avoid duplicate fetches if (this.modelPropsFetching.has(modelId)) return null; diff --git a/tools/server/webui/src/lib/utils/latex-protection.test.ts b/tools/server/webui/src/lib/utils/latex-protection.test.ts index 2354f8fa0e..40fe1b0db2 100644 --- a/tools/server/webui/src/lib/utils/latex-protection.test.ts +++ b/tools/server/webui/src/lib/utils/latex-protection.test.ts @@ -303,6 +303,27 @@ $$\n\\pi_n(\\mathbb{S}^3) = \\begin{cases} expect(output).toBe(input); // Code blocks prevent misinterpretation }); + test('preserves backslash parentheses in code blocks (GitHub issue)', () => { + const input = '```python\nfoo = "\\(bar\\)"\n```'; + const output = preprocessLaTeX(input); + + expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied + }); + + test('preserves backslash brackets in code blocks', () => { + const input = '```python\nfoo = "\\[bar\\]"\n```'; + const output = preprocessLaTeX(input); + + expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied + }); + + test('preserves backslash parentheses in inline code', () => { + const input = 'Use `foo = "\\(bar\\)"` in your code.'; + const output = preprocessLaTeX(input); + + expect(output).toBe(input); + }); + test('escape backslash in mchem ce', () => { const input = 'mchem ce:\n$\\ce{2H2(g) + O2(g) -> 2H2O(l)}$'; const output = preprocessLaTeX(input); diff --git a/tools/server/webui/src/lib/utils/latex-protection.ts b/tools/server/webui/src/lib/utils/latex-protection.ts index 7f5cf2cddf..cafa2d4761 100644 --- a/tools/server/webui/src/lib/utils/latex-protection.ts +++ b/tools/server/webui/src/lib/utils/latex-protection.ts @@ -226,19 +226,16 @@ export function preprocessLaTeX(content: string): string { return expr; }); - // Step 5: Restore code blocks - content = content.replace(/<>/g, (_, index) => { - return codeBlocks[parseInt(index)]; - }); - - // Step 6: Apply additional escaping functions (brackets and mhchem) + // Step 5: Apply additional escaping functions (brackets and mhchem) + // This must happen BEFORE restoring code blocks to avoid affecting code content content = escapeBrackets(content); if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) { content = escapeMhchem(content); } - // Final pass: Convert \(...\) → $...$, \[...\] → $$...$$ + // Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$ + // This must happen BEFORE restoring code blocks to avoid affecting code content content = content // Using the look‑behind pattern `(? 
{ - return `${prefix}$$${content}$$`; + (_, content: string) => { + return `$$${content}$$`; } ); - // Step 7: Restore blockquote markers + // Step 7: Restore code blocks + // This happens AFTER all LaTeX conversions to preserve code content + content = content.replace(/<>/g, (_, index) => { + return codeBlocks[parseInt(index)]; + }); + + // Step 8: Restore blockquote markers if (blockquoteMarkers.size > 0) { const finalLines = content.split('\n'); const restoredLines = finalLines.map((line, index) => { diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 369502d7ae..e90e8e2d1b 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -9,6 +9,10 @@ if (NOT MSVC) endif() target_link_libraries (${TARGET} PRIVATE Threads::Threads) + +if (WIN32 AND NOT MSVC) + target_link_libraries(${TARGET} PUBLIC ws2_32) +endif() target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_definitions(${TARGET} PRIVATE