clip : print more detailed op support info during warmup

Georgi Gerganov 2025-11-02 10:13:48 +02:00
parent b4955f0ae6
commit bdb43f6e9c
4 changed files with 108 additions and 54 deletions
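In short: clip gets its own clip_flash_attn_type enum (decoupling clip.h from llama.h), and warmup() now collects per-op backend support info instead of a single boolean, logging every op that falls back to the CPU backend. A minimal sketch of the probing flow, distilled from the clip.cpp diff below (types and names as they appear there, simplified):

    // sketch of the AUTO probing flow in warmup(), simplified from the diff below
    if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
        // build the graph once with flash attention enabled to test backend support
        ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
        support_info_graph info = alloc_compute_meta(ctx_clip);
        if (!info.fattn) {
            // GGML_OP_FLASH_ATTN_EXT is unsupported - rebuild the graph without it
            ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
            alloc_compute_meta(ctx_clip);
        }
    }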

clip.cpp

@@ -4,10 +4,8 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "clip-impl.h"
#include "mtmd.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
@@ -18,15 +16,12 @@
#include <cstring>
#include <fstream>
#include <map>
#include <regex>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <sstream>
#include <cinttypes>
#include <limits>
#include <array>
#include <numeric>
#include <functional>
// TODO: allow to pass callback from user code
@@ -428,7 +423,7 @@ struct clip_ctx {
int max_nodes = 8192;
ggml_backend_sched_ptr sched;
llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
// for debugging
bool debug_graph = false;
@@ -2266,7 +2261,7 @@ private:
ggml_tensor * cur;
if (ctx->flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) {
if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
k = ggml_cast(ctx0, k, GGML_TYPE_F16);
@@ -3204,30 +3199,58 @@ struct clip_model_loader {
}
}
void warmup(clip_ctx & ctx_clip) {
if (ctx_clip.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
struct support_info_op {
ggml_tensor * op;
// true if the op runs on the accelerated ctx_clip.backend
bool is_accel = true;
};
struct support_info_graph {
// whether the clip_ctx.backend supports flash attention
bool fattn = true;
std::vector<support_info_op> ops;
};
static void warmup(clip_ctx & ctx_clip) {
support_info_graph info;
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
// try to enable flash attention to see if it's supported
ctx_clip.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
bool supported = alloc_compute_meta(ctx_clip);
if (!supported) {
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
info = alloc_compute_meta(ctx_clip);
if (!info.fattn) {
LOG_WRN("%s: flash attention not supported, memory usage will increase\n", __func__);
// TODO: maybe log more details about why flash attention is not supported
ctx_clip.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
alloc_compute_meta(ctx_clip);
}
} else {
bool supported = alloc_compute_meta(ctx_clip);
if (!supported) {
info = alloc_compute_meta(ctx_clip);
if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
}
}
LOG_INF("%s: flash attention is %s\n", __func__,
(ctx_clip.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
(ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
// print ops that are not supported by the GPU backend (if there is one)
if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
for (const auto & op : info.ops) {
if (!op.is_accel) {
LOG_WRN("%s: op %16s is not supported by the CLIP backend: type = %s, ne = [%d %d %d %d]\n", __func__,
ggml_op_name(op.op->op),
ggml_type_name(op.op->type),
op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
}
}
}
}
// return false if flash attention is not supported
bool alloc_compute_meta(clip_ctx & ctx_clip) {
// builds the compute graph and records which ops the backend supports (including flash attention)
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
const auto & hparams = ctx_clip.model.hparams;
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
@@ -3264,67 +3287,87 @@ struct clip_model_loader {
LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
// check flash attention support
support_info_graph res {
/*.fattn = */ true,
/*.ops = */ {},
};
// check op support
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * node = ggml_graph_node(gf, i);
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
return false;
res.ops.push_back({node, true});
if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
res.ops.back().is_accel = false;
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
res.fattn = false;
}
}
}
return true;
return res;
}
void get_bool(const std::string & key, bool & output, bool required = true) {
void get_bool(const std::string & key, bool & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_bool(ctx_gguf.get(), i);
}
void get_i32(const std::string & key, int & output, bool required = true) {
void get_i32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_i32(ctx_gguf.get(), i);
}
void get_u32(const std::string & key, int & output, bool required = true) {
void get_u32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_u32(ctx_gguf.get(), i);
}
void get_f32(const std::string & key, float & output, bool required = true) {
void get_f32(const std::string & key, float & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_f32(ctx_gguf.get(), i);
}
void get_string(const std::string & key, std::string & output, bool required = true) {
void get_string(const std::string & key, std::string & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
}
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) {
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
int n = gguf_get_arr_n(ctx_gguf.get(), i);
@@ -3335,7 +3378,7 @@ struct clip_model_loader {
}
}
void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
auto & hparams = model.hparams;
for (int x = 1; x <= max_patches_per_side; x++) {
for (int y = 1; y <= max_patches_per_side; y++) {
@@ -3375,12 +3418,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
} catch (const std::exception & e) {
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
if (ctx_vision) {
delete ctx_vision;
}
if (ctx_audio) {
delete ctx_audio;
}
delete ctx_vision;
delete ctx_audio;
return {nullptr, nullptr};
}
@@ -3418,10 +3459,10 @@ void clip_image_size_free(struct clip_image_size * load_image_size) {
}
delete load_image_size;
}
void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
return batch->entries.size();
@@ -4539,6 +4580,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
if (ggml_backend_sched_get_n_splits(ctx->sched.get()) > 1) {
LOG_WRN("%s: *****************************************************************\n", __func__);
LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
LOG_WRN("%s: use GGML_SCHED_DEBUG=2 to determine which ops \n", __func__);
LOG_WRN("%s: the performance will be suboptimal \n", __func__);
LOG_WRN("%s: \n", __func__);
LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);

clip.h

@@ -1,7 +1,7 @@
#pragma once
#include "ggml.h"
#include "mtmd.h"
#include <stddef.h>
#include <stdint.h>
@@ -23,10 +23,16 @@ enum clip_modality {
CLIP_MODALITY_AUDIO,
};
enum clip_flash_attn_type {
CLIP_FLASH_ATTN_TYPE_AUTO = -1,
CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
};
struct clip_context_params {
bool use_gpu;
enum ggml_log_level verbosity;
llama_flash_attn_type flash_attn_type;
enum clip_flash_attn_type flash_attn_type;
};
struct clip_init_result {

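For callers using clip.h directly, the new enum replaces the llama_flash_attn_type field one-for-one; a minimal usage sketch (the mmproj path is a placeholder, the rest follows the header above):

    // sketch: initialize a CLIP context with flash attention auto-detection
    clip_context_params params;
    params.use_gpu         = true;
    params.verbosity       = GGML_LOG_LEVEL_INFO;
    params.flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; // probed during warmup
    clip_init_result res = clip_init("mmproj.gguf", params); // placeholder path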
mtmd.cpp

@@ -19,7 +19,6 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
// represents raw image data, layout is RGBRGBRGB...
@@ -92,6 +91,15 @@ const char * mtmd_default_marker() {
return "<__media__>";
}
static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
switch (flash_attn_type) {
case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO;
case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED;
}
return CLIP_FLASH_ATTN_TYPE_AUTO;
}
mtmd_context_params mtmd_context_params_default() {
mtmd_context_params params;
params.use_gpu = true;
@@ -165,7 +173,7 @@ struct mtmd_context {
clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
ctx_clip_params.flash_attn_type = ctx_params.flash_attn_type;
ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v;
ctx_a = res.ctx_a;
@@ -380,9 +388,7 @@ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
}
void mtmd_free(mtmd_context * ctx) {
if (ctx) {
delete ctx;
}
delete ctx;
}
struct mtmd_tokenizer {

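The dropped `if (ctx)` guard in mtmd_free above (and the matching change to the clip_image_*_free functions earlier) relies on `delete` being a no-op for null pointers in standard C++, so the null checks were redundant:

    mtmd_context * ctx = nullptr;
    mtmd_free(ctx); // fine: delete on a null pointer is a no-op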
mtmd.h

@@ -82,7 +82,7 @@ struct mtmd_context_params {
enum ggml_log_level verbosity;
const char * image_marker; // deprecated, use media_marker instead
const char * media_marker;
llama_flash_attn_type flash_attn_type;
enum llama_flash_attn_type flash_attn_type;
};
MTMD_API const char * mtmd_default_marker(void);
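From the mtmd side nothing changes for callers: the params struct still carries a llama_flash_attn_type, and the new mapping in mtmd.cpp translates it for clip. A usage sketch, assuming an already loaded text model (text_model) and a placeholder mmproj path:

    // sketch: flash attention preference flows from mtmd params into clip
    mtmd_context_params params = mtmd_context_params_default();
    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // mapped to CLIP_FLASH_ATTN_TYPE_AUTO
    mtmd_context * ctx = mtmd_init_from_file("mmproj.gguf", text_model, params);
    // ... use ctx ...
    mtmd_free(ctx);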