clip : print more detailed op support info during warmup

Georgi Gerganov 2025-11-02 10:13:48 +02:00
parent b4955f0ae6
commit bdb43f6e9c
4 changed files with 108 additions and 54 deletions
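In short: clip gets its own clip_flash_attn_type enum (decoupling clip.h from llama.h), and warmup() now collects per-op backend support info instead of a single boolean, logging every op that falls back to the CPU backend. A minimal sketch of the probing flow, distilled from the clip.cpp diff below (types and names as they appear there, simplified):

    // sketch of the AUTO probing flow in warmup(), simplified from the diff below
    if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
        // build the graph once with flash attention enabled to test backend support
        ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
        support_info_graph info = alloc_compute_meta(ctx_clip);
        if (!info.fattn) {
            // GGML_OP_FLASH_ATTN_EXT is unsupported - rebuild the graph without it
            ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
            alloc_compute_meta(ctx_clip);
        }
    }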

clip.cpp

@@ -4,10 +4,8 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "clip-impl.h"
#include "mtmd.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
@@ -18,15 +16,12 @@
#include <cstring>
#include <fstream>
#include <map>
#include <regex>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <sstream>
#include <cinttypes>
#include <limits>
#include <array>
#include <numeric>
#include <functional>
// TODO: allow to pass callback from user code
@@ -428,7 +423,7 @@ struct clip_ctx {
int max_nodes = 8192;
ggml_backend_sched_ptr sched;
llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
// for debugging
bool debug_graph = false;
@@ -2266,7 +2261,7 @@ private:
ggml_tensor * cur;
if (ctx->flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) {
if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
k = ggml_cast(ctx0, k, GGML_TYPE_F16);
@@ -3204,30 +3199,58 @@ struct clip_model_loader {
}
}
void warmup(clip_ctx & ctx_clip) {
if (ctx_clip.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
struct support_info_op {
ggml_tensor * op;
// true if the op runs on the accelerated ctx_clip.backend
bool is_accel = true;
};
struct support_info_graph {
// whether the clip_ctx.backend supports flash attention
bool fattn = true;
std::vector<support_info_op> ops;
};
static void warmup(clip_ctx & ctx_clip) {
support_info_graph info;
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
// try to enable flash attention to see if it's supported
ctx_clip.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
bool supported = alloc_compute_meta(ctx_clip);
if (!supported) {
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
info = alloc_compute_meta(ctx_clip);
if (!info.fattn) {
LOG_WRN("%s: flash attention not supported, memory usage will increase\n", __func__);
// TODO: maybe log more details about why flash attention is not supported
ctx_clip.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
alloc_compute_meta(ctx_clip);
}
} else {
bool supported = alloc_compute_meta(ctx_clip);
if (!supported) {
info = alloc_compute_meta(ctx_clip);
if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
}
}
LOG_INF("%s: flash attention is %s\n", __func__,
(ctx_clip.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
(ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
// print ops that are not supported by the GPU backend (if there is one)
if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
for (const auto & op : info.ops) {
if (!op.is_accel) {
LOG_WRN("%s: op %16s is not supported by the CLIP backend: type = %s, ne = [%d %d %d %d]\n", __func__,
ggml_op_name(op.op->op),
ggml_type_name(op.op->type),
op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
}
}
}
}
// return false if flash attention is not supported
bool alloc_compute_meta(clip_ctx & ctx_clip) {
// builds the compute graph and records which ops the backend supports (including flash attention)
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
const auto & hparams = ctx_clip.model.hparams;
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
@@ -3264,67 +3287,87 @@ struct clip_model_loader {
LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
// check flash attention support
support_info_graph res {
/*.fattn = */ true,
/*.ops = */ {},
};
// check op support
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * node = ggml_graph_node(gf, i);
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
return false;
res.ops.push_back({node, true});
if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
res.ops.back().is_accel = false;
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
res.fattn = false;
}
}
}
return true;
return res;
}
void get_bool(const std::string & key, bool & output, bool required = true) {
void get_bool(const std::string & key, bool & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_bool(ctx_gguf.get(), i);
}
void get_i32(const std::string & key, int & output, bool required = true) {
void get_i32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_i32(ctx_gguf.get(), i);
}
void get_u32(const std::string & key, int & output, bool required = true) {
void get_u32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_u32(ctx_gguf.get(), i);
}
void get_f32(const std::string & key, float & output, bool required = true) {
void get_f32(const std::string & key, float & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_f32(ctx_gguf.get(), i);
}
void get_string(const std::string & key, std::string & output, bool required = true) {
void get_string(const std::string & key, std::string & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
}
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) {
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
int n = gguf_get_arr_n(ctx_gguf.get(), i);
@@ -3335,7 +3378,7 @@ struct clip_model_loader {
}
}
void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
auto & hparams = model.hparams;
for (int x = 1; x <= max_patches_per_side; x++) {
for (int y = 1; y <= max_patches_per_side; y++) {
@@ -3375,12 +3418,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
} catch (const std::exception & e) {
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
if (ctx_vision) {
delete ctx_vision;
}
if (ctx_audio) {
delete ctx_audio;
}
delete ctx_vision;
delete ctx_audio;
return {nullptr, nullptr};
}
@@ -3418,10 +3459,10 @@ void clip_image_size_free(struct clip_image_size * load_image_size) {
}
delete load_image_size;
}
void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
return batch->entries.size();
@@ -4539,6 +4580,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
if (ggml_backend_sched_get_n_splits(ctx->sched.get()) > 1) {
LOG_WRN("%s: *****************************************************************\n", __func__);
LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
LOG_WRN("%s: use GGML_SCHED_DEBUG=2 to determine which ops \n", __func__);
LOG_WRN("%s: the performance will be suboptimal \n", __func__);
LOG_WRN("%s: \n", __func__);
LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);

clip.h

@@ -1,7 +1,7 @@
#pragma once
#include "ggml.h"
#include "mtmd.h"
#include <stddef.h>
#include <stdint.h>
@@ -23,10 +23,16 @@ enum clip_modality {
CLIP_MODALITY_AUDIO,
};
enum clip_flash_attn_type {
CLIP_FLASH_ATTN_TYPE_AUTO = -1,
CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
};
struct clip_context_params {
bool use_gpu;
enum ggml_log_level verbosity;
llama_flash_attn_type flash_attn_type;
enum clip_flash_attn_type flash_attn_type;
};
struct clip_init_result {

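For callers using clip.h directly, the new enum replaces the llama_flash_attn_type field one-for-one; a minimal usage sketch (the mmproj path is a placeholder, the rest follows the header above):

    // sketch: initialize a CLIP context with flash attention auto-detection
    clip_context_params params;
    params.use_gpu         = true;
    params.verbosity       = GGML_LOG_LEVEL_INFO;
    params.flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; // probed during warmup
    clip_init_result res = clip_init("mmproj.gguf", params); // placeholder path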
mtmd.cpp

@@ -19,7 +19,6 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
// represents raw image data, layout is RGBRGBRGB...
@@ -92,6 +91,15 @@ const char * mtmd_default_marker() {
return "<__media__>";
}
static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
switch (flash_attn_type) {
case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO;
case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED;
}
return CLIP_FLASH_ATTN_TYPE_AUTO;
}
mtmd_context_params mtmd_context_params_default() {
mtmd_context_params params;
params.use_gpu = true;
@@ -165,7 +173,7 @@ struct mtmd_context {
clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
ctx_clip_params.flash_attn_type = ctx_params.flash_attn_type;
ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v;
ctx_a = res.ctx_a;
@@ -380,9 +388,7 @@ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
}
void mtmd_free(mtmd_context * ctx) {
if (ctx) {
delete ctx;
}
delete ctx;
}
struct mtmd_tokenizer {

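The dropped `if (ctx)` guard in mtmd_free above (and the matching change to the clip_image_*_free functions earlier) relies on `delete` being a no-op for null pointers in standard C++, so the null checks were redundant:

    mtmd_context * ctx = nullptr;
    mtmd_free(ctx); // fine: delete on a null pointer is a no-op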
mtmd.h

@@ -82,7 +82,7 @@ struct mtmd_context_params {
enum ggml_log_level verbosity;
const char * image_marker; // deprecated, use media_marker instead
const char * media_marker;
llama_flash_attn_type flash_attn_type;
enum llama_flash_attn_type flash_attn_type;
};
MTMD_API const char * mtmd_default_marker(void);
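From the mtmd side nothing changes for callers: the params struct still carries a llama_flash_attn_type, and the new mapping in mtmd.cpp translates it for clip. A usage sketch, assuming an already loaded text model (text_model) and a placeholder mmproj path:

    // sketch: flash attention preference flows from mtmd params into clip
    mtmd_context_params params = mtmd_context_params_default();
    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // mapped to CLIP_FLASH_ATTN_TYPE_AUTO
    mtmd_context * ctx = mtmd_init_from_file("mmproj.gguf", text_model, params);
    // ... use ctx ...
    mtmd_free(ctx);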