clip : print more detailed op support info during warmup

This commit is contained in:
Georgi Gerganov 2025-11-02 10:13:48 +02:00
parent b4955f0ae6
commit bdb43f6e9c
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
4 changed files with 108 additions and 54 deletions

View File

@ -4,10 +4,8 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h" #include "clip.h"
#include "clip-impl.h" #include "clip-impl.h"
#include "mtmd.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-cpp.h" #include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "gguf.h" #include "gguf.h"
@ -18,15 +16,12 @@
#include <cstring> #include <cstring>
#include <fstream> #include <fstream>
#include <map> #include <map>
#include <regex>
#include <stdexcept> #include <stdexcept>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include <sstream>
#include <cinttypes> #include <cinttypes>
#include <limits> #include <limits>
#include <array> #include <array>
#include <numeric>
#include <functional> #include <functional>
// TODO: allow to pass callback from user code // TODO: allow to pass callback from user code
@ -428,7 +423,7 @@ struct clip_ctx {
int max_nodes = 8192; int max_nodes = 8192;
ggml_backend_sched_ptr sched; ggml_backend_sched_ptr sched;
llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
// for debugging // for debugging
bool debug_graph = false; bool debug_graph = false;
@ -2266,7 +2261,7 @@ private:
ggml_tensor * cur; ggml_tensor * cur;
if (ctx->flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) { if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
k = ggml_cast(ctx0, k, GGML_TYPE_F16); k = ggml_cast(ctx0, k, GGML_TYPE_F16);
@ -3204,30 +3199,58 @@ struct clip_model_loader {
} }
} }
void warmup(clip_ctx & ctx_clip) { struct support_info_op {
if (ctx_clip.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) { ggml_tensor * op;
// true if the op runs on the accelerated ctx_clip.backend
bool is_accel = true;
};
struct support_info_graph {
// whether the clip_ctx.backend supports flash attention
bool fattn = true;
std::vector<support_info_op> ops;
};
static void warmup(clip_ctx & ctx_clip) {
support_info_graph info;
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
// try to enable flash attention to see if it's supported // try to enable flash attention to see if it's supported
ctx_clip.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED; ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
bool supported = alloc_compute_meta(ctx_clip); info = alloc_compute_meta(ctx_clip);
if (!supported) { if (!info.fattn) {
LOG_WRN("%s: flash attention not supported, memory usage will increase\n", __func__); LOG_WRN("%s: flash attention not supported, memory usage will increase\n", __func__);
// TODO: maybe log more details about why flash attention is not supported // TODO: maybe log more details about why flash attention is not supported
ctx_clip.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
alloc_compute_meta(ctx_clip); alloc_compute_meta(ctx_clip);
} }
} else { } else {
bool supported = alloc_compute_meta(ctx_clip); info = alloc_compute_meta(ctx_clip);
if (!supported) { if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__); LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
} }
} }
LOG_INF("%s: flash attention is %s\n", __func__, LOG_INF("%s: flash attention is %s\n", __func__,
(ctx_clip.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled"); (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
// print ops that are not supported by the GPU backend (if there is one)
if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
for (const auto & op : info.ops) {
if (!op.is_accel) {
LOG_WRN("%s: op %16s is not supported by the CLIP backend: type = %s, ne = [%d %d %d %d]\n", __func__,
ggml_op_name(op.op->op),
ggml_type_name(op.op->type),
op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
}
}
}
} }
// return false if flash attention is not supported // return false if at least one op is not supported by the backend
bool alloc_compute_meta(clip_ctx & ctx_clip) { static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
const auto & hparams = ctx_clip.model.hparams; const auto & hparams = ctx_clip.model.hparams;
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
@ -3264,67 +3287,87 @@ struct clip_model_loader {
LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes); LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
// check flash attention support support_info_graph res {
/*.fattn = */ true,
/*.ops = */ {},
};
// check op support
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) { for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * node = ggml_graph_node(gf, i); ggml_tensor * node = ggml_graph_node(gf, i);
if (node->op == GGML_OP_FLASH_ATTN_EXT) { res.ops.push_back({node, true});
if (!ggml_backend_supports_op(ctx_clip.backend, node)) { if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
return false; res.ops.back().is_accel = false;
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
res.fattn = false;
} }
} }
} }
return true;
}
void get_bool(const std::string & key, bool & output, bool required = true) { return res;
}
void get_bool(const std::string & key, bool & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) { if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key); if (required) {
throw std::runtime_error("Key not found: " + key);
}
return; return;
} }
output = gguf_get_val_bool(ctx_gguf.get(), i); output = gguf_get_val_bool(ctx_gguf.get(), i);
} }
void get_i32(const std::string & key, int & output, bool required = true) { void get_i32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) { if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key); if (required) {
throw std::runtime_error("Key not found: " + key);
}
return; return;
} }
output = gguf_get_val_i32(ctx_gguf.get(), i); output = gguf_get_val_i32(ctx_gguf.get(), i);
} }
void get_u32(const std::string & key, int & output, bool required = true) { void get_u32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) { if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key); if (required) {
throw std::runtime_error("Key not found: " + key);
}
return; return;
} }
output = gguf_get_val_u32(ctx_gguf.get(), i); output = gguf_get_val_u32(ctx_gguf.get(), i);
} }
void get_f32(const std::string & key, float & output, bool required = true) { void get_f32(const std::string & key, float & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) { if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key); if (required) {
throw std::runtime_error("Key not found: " + key);
}
return; return;
} }
output = gguf_get_val_f32(ctx_gguf.get(), i); output = gguf_get_val_f32(ctx_gguf.get(), i);
} }
void get_string(const std::string & key, std::string & output, bool required = true) { void get_string(const std::string & key, std::string & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) { if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key); if (required) {
throw std::runtime_error("Key not found: " + key);
}
return; return;
} }
output = std::string(gguf_get_val_str(ctx_gguf.get(), i)); output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
} }
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) { void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) { if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key); if (required) {
throw std::runtime_error("Key not found: " + key);
}
return; return;
} }
int n = gguf_get_arr_n(ctx_gguf.get(), i); int n = gguf_get_arr_n(ctx_gguf.get(), i);
@ -3335,7 +3378,7 @@ struct clip_model_loader {
} }
} }
void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) { static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
auto & hparams = model.hparams; auto & hparams = model.hparams;
for (int x = 1; x <= max_patches_per_side; x++) { for (int x = 1; x <= max_patches_per_side; x++) {
for (int y = 1; y <= max_patches_per_side; y++) { for (int y = 1; y <= max_patches_per_side; y++) {
@ -3375,12 +3418,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
} catch (const std::exception & e) { } catch (const std::exception & e) {
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what()); LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
if (ctx_vision) {
delete ctx_vision; delete ctx_vision;
}
if (ctx_audio) {
delete ctx_audio; delete ctx_audio;
}
return {nullptr, nullptr}; return {nullptr, nullptr};
} }
@ -3418,10 +3459,10 @@ void clip_image_size_free(struct clip_image_size * load_image_size) {
} }
delete load_image_size; delete load_image_size;
} }
void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; } void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; } void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; } void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) { size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
return batch->entries.size(); return batch->entries.size();
@ -4539,6 +4580,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
if (ggml_backend_sched_get_n_splits(ctx->sched.get()) > 1) { if (ggml_backend_sched_get_n_splits(ctx->sched.get()) > 1) {
LOG_WRN("%s: *****************************************************************\n", __func__); LOG_WRN("%s: *****************************************************************\n", __func__);
LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__); LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
LOG_WRN("%s: use GGML_SCHED_DEBUG=2 to determine which ops \n", __func__);
LOG_WRN("%s: the performance will be suboptimal \n", __func__); LOG_WRN("%s: the performance will be suboptimal \n", __func__);
LOG_WRN("%s: \n", __func__); LOG_WRN("%s: \n", __func__);
LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__); LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);

View File

@ -1,7 +1,7 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml.h"
#include "mtmd.h"
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
@ -23,10 +23,16 @@ enum clip_modality {
CLIP_MODALITY_AUDIO, CLIP_MODALITY_AUDIO,
}; };
enum clip_flash_attn_type {
CLIP_FLASH_ATTN_TYPE_AUTO = -1,
CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
};
struct clip_context_params { struct clip_context_params {
bool use_gpu; bool use_gpu;
enum ggml_log_level verbosity; enum ggml_log_level verbosity;
llama_flash_attn_type flash_attn_type; enum clip_flash_attn_type flash_attn_type;
}; };
struct clip_init_result { struct clip_init_result {

View File

@ -19,7 +19,6 @@
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <limits>
#include <vector> #include <vector>
// represents raw image data, layout is RGBRGBRGB... // represents raw image data, layout is RGBRGBRGB...
@ -92,6 +91,15 @@ const char * mtmd_default_marker() {
return "<__media__>"; return "<__media__>";
} }
static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
switch (flash_attn_type) {
case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO;
case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED;
}
return CLIP_FLASH_ATTN_TYPE_AUTO;
}
mtmd_context_params mtmd_context_params_default() { mtmd_context_params mtmd_context_params_default() {
mtmd_context_params params; mtmd_context_params params;
params.use_gpu = true; params.use_gpu = true;
@ -165,7 +173,7 @@ struct mtmd_context {
clip_context_params ctx_clip_params; clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu; ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity; ctx_clip_params.verbosity = ctx_params.verbosity;
ctx_clip_params.flash_attn_type = ctx_params.flash_attn_type; ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
auto res = clip_init(mmproj_fname, ctx_clip_params); auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v; ctx_v = res.ctx_v;
ctx_a = res.ctx_a; ctx_a = res.ctx_a;
@ -380,9 +388,7 @@ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
} }
void mtmd_free(mtmd_context * ctx) { void mtmd_free(mtmd_context * ctx) {
if (ctx) {
delete ctx; delete ctx;
}
} }
struct mtmd_tokenizer { struct mtmd_tokenizer {

View File

@ -82,7 +82,7 @@ struct mtmd_context_params {
enum ggml_log_level verbosity; enum ggml_log_level verbosity;
const char * image_marker; // deprecated, use media_marker instead const char * image_marker; // deprecated, use media_marker instead
const char * media_marker; const char * media_marker;
llama_flash_attn_type flash_attn_type; enum llama_flash_attn_type flash_attn_type;
}; };
MTMD_API const char * mtmd_default_marker(void); MTMD_API const char * mtmd_default_marker(void);