From d98b548120eecf98f0f6eaa1ba7e29b3afda9f2e Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Wed, 14 Jan 2026 20:29:35 +0100 Subject: [PATCH] Restore clip's cb() to its rightful glory - extract common debugging elements in llama (#17914) * Extract common debugging functions; plug eval-callback and mtmd's MTMD_DEBUG_GRAPH with same functionality * Move to common * Remove unneeded header * Unlink from common * chore: update webui build output * Cleanup; properly pass params to mtmd without depending on common; factorize debug.cpp to use common debug code. * Revert change to webapp * Post-merge adjust * Apply suggestions from code review Co-authored-by: Xuan-Son Nguyen * Apply code review changes * Remove changes to server-context * Remove mtmd.h include * Remove utility functions from header * Apply suggestions from code review Co-authored-by: Xuan-Son Nguyen * Rename functions * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen --------- Co-authored-by: Xuan-Son Nguyen --- common/CMakeLists.txt | 2 + common/debug.cpp | 165 ++++++++++++++++++ common/debug.h | 43 +++++ docs/backend/hexagon/CMakeUserPresets.json | 2 +- examples/debug/debug.cpp | 192 +-------------------- examples/eval-callback/eval-callback.cpp | 161 +---------------- tools/mtmd/clip-graph.h | 4 - tools/mtmd/clip.cpp | 52 ++---- tools/mtmd/clip.h | 3 + tools/mtmd/mtmd-cli.cpp | 7 + tools/mtmd/mtmd.cpp | 4 + tools/mtmd/mtmd.h | 20 ++- 12 files changed, 259 insertions(+), 396 deletions(-) create mode 100644 common/debug.cpp create mode 100644 common/debug.h diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3451a311d0..723973ed70 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -60,6 +60,8 @@ add_library(${TARGET} STATIC common.h console.cpp console.h + debug.cpp + debug.h download.cpp download.h http.h diff --git a/common/debug.cpp b/common/debug.cpp new file mode 100644 index 0000000000..fdaddb1443 --- /dev/null +++ b/common/debug.cpp @@ -0,0 +1,165 @@ +#include "debug.h" + +#include "log.h" + +#include <cmath> +#include <cstdio> + +static std::string common_ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static float common_ggml_get_float_value(const uint8_t * data, + ggml_type type, + const size_t * nb, + size_t i0, + size_t i1, + size_t i2, + size_t i3) { + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); + } else if (type == GGML_TYPE_F32) { + v = *(const float *) &data[i]; + } else if (type == GGML_TYPE_I64) { + v = (float) *(const int64_t *) &data[i]; + } else if (type == GGML_TYPE_I32) { + v = (float) *(const int32_t *) &data[i]; + } else if (type == GGML_TYPE_I16) { + v = (float) *(const int16_t *) &data[i]; + } else if (type == GGML_TYPE_I8) { + v = (float) *(const int8_t *) &data[i]; + } else if (type == GGML_TYPE_BF16) { + v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]); + } else { + GGML_ABORT("fatal error"); + } + return v; +} + +template <bool abort> +void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { + GGML_ASSERT(n > 0); + float sum = 0; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + for (int64_t i2 = 0; i2 < ne[2]; i2++) { +
for (int64_t i1 = 0; i1 < ne[1]; i1++) { + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3); + sum += v; + } + } + } + } + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + LOG_ERR(" [\n"); + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + if (i2 == n && ne[2] > 2 * n) { + LOG_ERR(" ..., \n"); + i2 = ne[2] - n; + } + LOG_ERR(" [\n"); + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + if (i1 == n && ne[1] > 2 * n) { + LOG_ERR(" ..., \n"); + i1 = ne[1] - n; + } + LOG_ERR(" ["); + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + if (i0 == n && ne[0] > 2 * n) { + LOG_ERR("..., "); + i0 = ne[0] - n; + } + const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3); + LOG_ERR("%12.4f", v); + if (i0 < ne[0] - 1) { + LOG_ERR(", "); + } + } + LOG_ERR("],\n"); + } + LOG_ERR(" ],\n"); + } + LOG_ERR(" ]\n"); + LOG_ERR(" sum = %f\n", sum); + } + + if constexpr (abort) { + if (std::isnan(sum)) { + LOG_ERR("encountered NaN - aborting\n"); + exit(0); + } + } +} + +/** + * GGML operations callback during the graph execution. + * + * @param t current tensor + * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor + * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. + * see ggml_backend_sched_eval_callback + * @param user_data user data to pass at each call back + * @return true to receive data or continue the graph, false otherwise + */ +template <bool abort> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (base_callback_data *) user_data; + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + if (ask) { + return true; // Always retrieve data + } + + bool matches_filter = cb_data->tensor_filters.empty(); + + if (!matches_filter) { + for (const auto & filter : cb_data->tensor_filters) { + if (std::regex_search(t->name, filter)) { + matches_filter = true; + break; + } + } + } + + char src1_str[128] = { 0 }; + if (src1) { + snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str()); + } + + if (matches_filter) { + LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type), + ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "", + common_ggml_ne_string(t).c_str()); + } + + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host) { + auto n_bytes = ggml_nbytes(t); + cb_data->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + } + + if (!ggml_is_quantized(t->type) && matches_filter) { + uint8_t * data = is_host ?
(uint8_t *) t->data : cb_data->data.data(); + common_debug_print_tensor<abort>(data, t->type, t->ne, t->nb, 3); + } + + return true; +} + +// Explicit template instantiations +template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *); +template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *); +template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t); +template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t); diff --git a/common/debug.h b/common/debug.h new file mode 100644 index 0000000000..0c55963258 --- /dev/null +++ b/common/debug.h @@ -0,0 +1,43 @@ +#pragma once +#include "common.h" +#include <cstdint> +#include <regex> +#include <vector> + +// common debug functions and structs + +// Print a tensor's detailed data +// data - the tensor's data in byte format +// type - the tensor's data type +// ne - the tensor dimensions array +// nb - the tensor strides array +// n - the number of rows/columns to fully print +template <bool abort> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n); + +// Intended to be used as a callback for ggml_backend_sched_eval_callback +// prints tensors that are processed in the computation graph +// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with +// non-empty filter_patterns. See examples/debug/debug.cpp for possible usage patterns +// The template parameter determines whether an error should be thrown whenever a NaN is encountered +// in a tensor (useful for stopping debug sessions on the first erroneous tensor) +// The callback data will be passed as the third parameter (user_data) +template <bool abort> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data); +struct base_callback_data { + std::vector<uint8_t> data; + std::vector<std::regex> tensor_filters; + + base_callback_data() = default; + + base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) { + for (const auto & pattern : filter_patterns) { + try { + std::string anchored_pattern = "^" + pattern; + tensor_filters.emplace_back(anchored_pattern, std::regex::optimize); + } catch (const std::regex_error & e) { + throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what()); + } + } + params.cb_eval = common_debug_cb_eval<true>; + params.cb_eval_user_data = this; + } +}; diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index a1d99018b1..1f2676c0bc 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -1,4 +1,4 @@ -{ +{ "version": 4, "configurePresets": [ { diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index 63be40c842..88947acbd3 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -1,11 +1,9 @@ +#include "debug.h" #include "arg.h" #include "common.h" #include "log.h" #include "llama.h" -#include "ggml.h" -#include -#include #include #include #include @@ -13,7 +11,7 @@ #include #include -static void print_usage(int, char ** argv) { +static void print_usage(int /*argc*/, char ** argv) { const std::string usage_template = R"( example usage: LOG("%s\n", usage.c_str()); } -static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data); - -struct callback_data { - std::vector<uint8_t> data; - std::vector<std::regex> tensor_filters; - - callback_data() = default; - - callback_data(common_params & params, const std::vector<std::string> &
filter_patterns) { - for (const auto & pattern : filter_patterns) { - try { - std::string anchored_pattern = "^" + pattern; - tensor_filters.emplace_back(anchored_pattern, std::regex::optimize); - } catch (const std::regex_error & e) { - throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what()); - } - } - params.cb_eval = ggml_debug; - params.cb_eval_user_data = this; - } -}; - static bool has_pooling(llama_context * ctx) { switch (llama_pooling_type(ctx)) { case LLAMA_POOLING_TYPE_NONE: @@ -120,168 +96,6 @@ struct output_data { } }; -static std::string ggml_ne_string(const ggml_tensor * t) { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; -} - -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -static float ggml_get_float_value(const uint8_t * data, ggml_type type, - const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - switch (type) { - case GGML_TYPE_F16: - return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); - case GGML_TYPE_F32: - return *(const float *) &data[i]; - case GGML_TYPE_I64: - return (float) *(const int64_t *) &data[i]; - case GGML_TYPE_I32: - return (float) *(const int32_t *) &data[i]; - case GGML_TYPE_I16: - return (float) *(const int16_t *) &data[i]; - case GGML_TYPE_I8: - return (float) *(const int8_t *) &data[i]; - case GGML_TYPE_BF16: - return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]); - default: - GGML_ABORT("fatal error"); - } -} - -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); - float sum = 0; - float sum_sq = 0.0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - sum += v; - sum_sq += v * v; - } - } - } - } - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG_DBG(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - LOG_DBG(" ..., \n"); - i2 = ne[2] - n; - } - LOG_DBG(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - LOG_DBG(" ..., \n"); - i1 = ne[1] - n; - } - LOG_DBG(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - LOG_DBG("..., "); - i0 = ne[0] - n; - } - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - LOG_DBG("%12.4f", v); - if (i0 < ne[0] - 1) { - LOG_DBG(", "); - } - } - LOG_DBG("],\n"); - } - LOG_DBG(" ],\n"); - } - LOG_DBG(" ]\n"); - LOG_DBG(" sum = %f\n", sum); - LOG_DBG(" sum_sq = %f\n", sum_sq); - } - - if (std::isnan(sum)) { - LOG_ERR("encountered NaN - aborting\n"); - exit(0); - } -} - -/** - * GGML operations callback during the graph execution. - * - * @param t current tensor - * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor - * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. 
- * see ggml_backend_sched_eval_callback - * @param user_data user data to pass at each call back - * @return true to receive data or continue the graph, false otherwise - */ -static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - - bool matches_filter = cb_data->tensor_filters.empty(); - - if (!matches_filter) { - for (const auto & filter : cb_data->tensor_filters) { - if (std::regex_search(t->name, filter)) { - matches_filter = true; - break; - } - } - } - - char src1_str[128] = {0}; - if (src1) { - snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); - } - - if (matches_filter) { - LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, - ggml_type_name(t->type), - ggml_op_desc(t), - src0->name, - ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - } - - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - if (!is_host) { - auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); - } - - if (!ggml_is_quantized(t->type) && matches_filter) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); - } - - return true; -} - - static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) { std::filesystem::create_directory(output_dir); auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix); @@ -408,7 +222,7 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - callback_data cb_data(params, params.tensor_filter); + base_callback_data cb_data(params, params.tensor_filter); auto llama_init = common_init_from_params(params); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 408338f1af..bd58734979 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,165 +1,12 @@ #include "arg.h" #include "common.h" +#include "debug.h" #include "log.h" #include "llama.h" -#include "ggml.h" - -#include -#include +#include "llama-cpp.h" #include #include -/** - * This the arbitrary data which will be passed to each callback. - * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. 
- */ -struct callback_data { - std::vector data; -}; - -static std::string ggml_ne_string(const ggml_tensor * t) { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; -} - -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - float v; - if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); - } else if (type == GGML_TYPE_F32) { - v = *(const float *) &data[i]; - } else if (type == GGML_TYPE_I64) { - v = (float) *(const int64_t *) &data[i]; - } else if (type == GGML_TYPE_I32) { - v = (float) *(const int32_t *) &data[i]; - } else if (type == GGML_TYPE_I16) { - v = (float) *(const int16_t *) &data[i]; - } else if (type == GGML_TYPE_I8) { - v = (float) *(const int8_t *) &data[i]; - } else if (type == GGML_TYPE_BF16) { - v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]); - } else { - GGML_ABORT("fatal error"); - } - return v; -} - -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); - float sum = 0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - sum += v; - } - } - } - } - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - LOG(" ..., \n"); - i2 = ne[2] - n; - } - LOG(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - LOG(" ..., \n"); - i1 = ne[1] - n; - } - LOG(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - LOG("..., "); - i0 = ne[0] - n; - } - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - LOG("%12.4f", v); - if (i0 < ne[0] - 1) LOG(", "); - } - LOG("],\n"); - } - LOG(" ],\n"); - } - LOG(" ]\n"); - LOG(" sum = %f\n", sum); - } - - // TODO: make this abort configurable/optional? - if (std::isnan(sum)) { - LOG_ERR("encountered NaN - aborting\n"); - exit(0); - } -} - -/** - * GGML operations callback during the graph execution. - * - * @param t current tensor - * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor - * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. 
- * see ggml_backend_sched_eval_callback - * @param user_data user data to pass at each call back - * @return true to receive data or continue the graph, false otherwise - */ -static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - - char src1_str[128] = {0}; - if (src1) { - snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); - } - - LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - - - // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - if (!is_host) { - auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); - } - - if (!ggml_is_quantized(t->type)) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); - } - - return true; -} - static bool run(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -182,7 +29,7 @@ static bool run(llama_context * ctx, const common_params & params) { } int main(int argc, char ** argv) { - callback_data cb_data; + base_callback_data cb_data; common_params params; @@ -197,7 +44,7 @@ int main(int argc, char ** argv) { // pass the callback to the backend scheduler // it will be executed for each node during the graph computation - params.cb_eval = ggml_debug; + params.cb_eval = common_debug_cb_eval; params.cb_eval_user_data = &cb_data; params.warmup = false; diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 2b1915779f..4c7f7504cf 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -32,10 +32,6 @@ struct clip_graph { const float kq_scale; const clip_flash_attn_type flash_attn_type; - // for debugging - const bool debug_graph; - std::vector & debug_print_tensors; - ggml_context_ptr ctx0_ptr; ggml_context * ctx0; ggml_cgraph * gf; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fd2fb07fd2..9b076e0c56 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -152,18 +152,14 @@ struct clip_ctx { ggml_backend_t backend_cpu = nullptr; ggml_backend_buffer_ptr buf; + int max_nodes = 8192; ggml_backend_sched_ptr sched; clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; bool is_allocated = false; - // for debugging - bool debug_graph = false; - std::vector debug_print_tensors; - clip_ctx(clip_context_params & ctx_params) { flash_attn_type = ctx_params.flash_attn_type; - debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr; backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (!backend_cpu) { throw std::runtime_error("failed to initialize CPU backend"); @@ -204,6 +200,10 @@ struct clip_ctx { sched.reset( ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) ); + + if (ctx_params.cb_eval != nullptr) { + ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data); + } } ~clip_ctx() { @@ -239,9 +239,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const 
clip_image_f32 & img) : n_mmproj_embd(clip_n_mmproj_embd(ctx)), eps(hparams.eps), kq_scale(1.0f / sqrtf((float)d_head)), - flash_attn_type(ctx->flash_attn_type), - debug_graph(ctx->debug_graph), - debug_print_tensors(ctx->debug_print_tensors) { + flash_attn_type(ctx->flash_attn_type) { struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), /*.mem_buffer =*/ ctx->buf_compute_meta.data(), @@ -252,14 +250,11 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); } -void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { - if (debug_graph) { - ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); - std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; - ggml_set_name(cur, cur_name.c_str()); - ggml_set_output(cur); - ggml_build_forward_expand(gf, cur); - debug_print_tensors.push_back(cur); +void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); } } @@ -1519,8 +1514,8 @@ struct clip_model_loader { model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); - model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); - model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI)); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: @@ -1761,8 +1756,8 @@ struct clip_model_loader { model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); - model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight")); - model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_EOI)); } break; case PROJECTOR_TYPE_LLAMA4: { @@ -3339,7 +3334,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // build the inference graph - ctx->debug_print_tensors.clear(); ggml_backend_sched_reset(ctx->sched.get()); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); @@ -3709,18 +3703,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return false; } - // print debug nodes - if (ctx->debug_graph) { - LOG_INF("\n\n---\n\n"); - LOG_INF("\n\nDebug graph:\n\n"); - for (ggml_tensor * t : ctx->debug_print_tensors) { - std::vector data(ggml_nbytes(t)); - ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); - print_tensor_shape(t); - print_tensor_data(t, data.data(), 3); - } - } - // the last node is the embedding tensor ggml_tensor * embeddings = ggml_graph_node(gf, -1); @@ -3872,7 +3854,6 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { // // API for debugging // - void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { clip_image_f32 img; img.nx = w; @@ -3881,9 +3862,6 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { for (int i = 0; i < h * w * 3; i++) 
{ img.buf[i] = static_cast<float>(fill_value); } - bool cur_debug_graph = ctx->debug_graph; - ctx->debug_graph = true; clip_image_encode(ctx, 1, &img, nullptr); - ctx->debug_graph = cur_debug_graph; GGML_ASSERT(img.buf.empty() && "expected, always stop here"); } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 27ee020182..71b58484d6 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "mtmd.h" #include #include @@ -37,6 +38,8 @@ struct clip_context_params { int image_min_tokens; int image_max_tokens; bool warmup; + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; struct clip_init_result { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 1ba02a5233..054c7faa6a 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -1,4 +1,5 @@ #include "arg.h" +#include "debug.h" #include "log.h" #include "common.h" #include "sampling.h" @@ -88,6 +89,8 @@ struct mtmd_cli_context { int n_threads = 1; llama_pos n_past = 0; + base_callback_data cb_data; + mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { model = llama_init->model(); lctx = llama_init->context(); @@ -139,6 +142,10 @@ struct mtmd_cli_context { mparams.warmup = params.warmup; mparams.image_min_tokens = params.image_min_tokens; mparams.image_max_tokens = params.image_max_tokens; + if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) { + mparams.cb_eval_user_data = &cb_data; + mparams.cb_eval = common_debug_cb_eval<false>; + } ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams)); if (!ctx_vision.get()) { LOG_ERR("Failed to load vision model from %s\n", clip_path); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index f25706987e..32a24bfcea 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -111,6 +111,8 @@ mtmd_context_params mtmd_context_params_default() { /* warmup */ true, /* image_min_tokens */ -1, /* image_max_tokens */ -1, + /* cb_eval */ nullptr, + /* cb_eval_user_data */ nullptr, }; return params; } @@ -176,6 +178,8 @@ struct mtmd_context { /* image_min_tokens */ ctx_params.image_min_tokens, /* image_max_tokens */ ctx_params.image_max_tokens, /* warmup */ ctx_params.warmup, + /* cb_eval */ ctx_params.cb_eval, + /* cb_eval_user_data */ ctx_params.cb_eval_user_data, }; auto res = clip_init(mmproj_fname, ctx_clip_params); diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 44d05ceaee..a12c28ef22 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -95,6 +95,10 @@ struct mtmd_context_params { // limit number of image tokens, only for vision models with dynamic resolution int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) + + // evaluation callback, passed through to the underlying clip backend scheduler + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; MTMD_API const char * mtmd_default_marker(void); @@ -273,12 +277,12 @@ struct bitmap { ptr.reset(mtmd_bitmap_init(nx, ny, data)); } ~bitmap() = default; - uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); } - uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); } - const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); } - size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); } - std::string id() { return mtmd_bitmap_get_id(ptr.get()); } - void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); } + uint32_t nx() const { return
mtmd_bitmap_get_nx(ptr.get()); } + uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } + const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); } + size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); } + std::string id() const { return mtmd_bitmap_get_id(ptr.get()); } + void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); } }; struct bitmaps { @@ -302,8 +306,8 @@ struct input_chunks { input_chunks() = default; input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} ~input_chunks() = default; - size_t size() { return mtmd_input_chunks_size(ptr.get()); } - const mtmd_input_chunk * operator[](size_t idx) { + size_t size() const { return mtmd_input_chunks_size(ptr.get()); } + const mtmd_input_chunk * operator[](size_t idx) const { return mtmd_input_chunks_get(ptr.get(), idx); } };
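For reviewers, a minimal sketch of how the pieces added above fit together from caller code outside of mtmd-cli (illustrative only, not part of the patch; the helper name and the choice of the non-aborting <false> variant are assumptions):

    #include "debug.h"   // common/debug.h introduced by this patch
    #include "llama.h"
    #include "mtmd.h"

    // Sketch: wire the shared debug callback into an mtmd context so every tensor
    // evaluated by the clip graph is printed, mirroring what mtmd-cli.cpp does when
    // MTMD_DEBUG_GRAPH is set.
    static mtmd_context * init_mtmd_with_graph_debug(const char * mmproj_path, const llama_model * model) {
        static base_callback_data cb_data;                         // keeps the host-side tensor copy alive across callbacks
        mtmd_context_params mparams = mtmd_context_params_default();
        mparams.cb_eval           = common_debug_cb_eval<false>;   // assumed: print tensors, do not abort on NaN
        mparams.cb_eval_user_data = &cb_data;
        return mtmd_init_from_file(mmproj_path, model, mparams);   // nullptr on failure, as in mtmd-cli.cpp
    }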