From a488b495f7d8efccd8fc9ee84a2c7ca8c61428df Mon Sep 17 00:00:00 2001
From: bluebread
Date: Sat, 29 Nov 2025 02:17:49 +0000
Subject: [PATCH] mtmd: SAM numerically works

---
 convert_hf_to_gguf.py  |  12 +-
 ggml/src/ggml.c        |   1 +
 tools/mtmd/clip-impl.h | 182 ++++++++++++++++++++++++++++++++++++-
 tools/mtmd/clip.cpp    | 197 ++++++++++++++++++++++------------------
 4 files changed, 297 insertions(+), 95 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 12baa198fe..2ef5430c61 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5839,12 +5839,14 @@ class DeepseekOCRVisionModel(MmprojModel):
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # TODO: force F32 for numerical stability; may be removed later
+        return gguf.GGMLQuantizationType.F32
         # related to https://github.com/ggml-org/llama.cpp/issues/13025
-        if "input_projection" in name:
-            return gguf.GGMLQuantizationType.F16
-        if ".embeddings." in name:
-            return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
+        # if "input_projection" in name:
+        #     return gguf.GGMLQuantizationType.F16
+        # if ".embeddings." in name:
+        #     return gguf.GGMLQuantizationType.F32
+        # return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Only process vision-related tensors, skip language model tensors
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9be35c1be8..837b1751e3 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5081,6 +5081,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     GGML_ASSERT(q->ne[3] == v->ne[3]);
 
     if (mask) {
+        GGML_ASSERT(mask->type == GGML_TYPE_F16);
        GGML_ASSERT(ggml_is_contiguous(mask));
        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
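The new assert makes explicit the contract that the SAM attention rewrite in clip.cpp below relies on: a mask passed to ggml_flash_attn_ext must be contiguous F16 with its query dimension padded to a multiple of GGML_KQ_MASK_PAD. A minimal sketch of preparing such a mask from an F32 bias, mirroring what the clip.cpp hunk does later in this patch (bias, n_q, and the surrounding variables are illustrative, not part of the patch):

    // bias: F32 attention bias of shape [n_kv, n_q, n_head, n_batch]
    const int64_t n_pad = GGML_PAD(n_q, GGML_KQ_MASK_PAD) - n_q;
    ggml_tensor * mask  = ggml_pad (ctx0, bias, 0, n_pad, 0, 0);  // pad the query dim
    mask                = ggml_cast(ctx0, mask, GGML_TYPE_F16);   // F16, as asserted
    cur = ggml_flash_attn_ext(ctx0, Q, K, V, mask, scale, 0.0f, 0.0f);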
src1_str : ""); +} + static void print_tensor_shape(ggml_tensor * t) { printf("%s.shape = [", t->name); for (int i = 0; i < ggml_n_dims(t); ++i) { @@ -460,12 +488,50 @@ static void print_tensor_shape(ggml_tensor * t) { printf("]\n"); } +static void print_tensor_sum(ggml_tensor * t, uint8_t * data, int64_t n) { + (void) n; // unused parameter + ggml_type type = t->type; + int64_t * ne = t->ne; + size_t * nb = t->nb; + double sum = 0.0; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); + } else if (type == GGML_TYPE_F32) { + v = *(float *) &data[i]; + } else if (type == GGML_TYPE_I32) { + v = (float) *(int32_t *) &data[i]; + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) &data[i]; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) &data[i]; + } else { + GGML_ABORT("fatal error"); + } + sum += v; + } + } + } + } + printf("%s.sum = %.6f\n", t->name, sum); +} + static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { ggml_type type = t->type; int64_t * ne = t->ne; size_t * nb = t->nb; + printf("%s.data: [\n", t->name); for (int64_t i3 = 0; i3 < ne[3]; i3++) { - printf("%s.data: [\n", t->name); + if (i3 == n && ne[3] > 2*n) { + printf(" ..., \n"); + i3 = ne[3] - n; + } + printf(" [\n"); for (int64_t i2 = 0; i2 < ne[2]; i2++) { if (i2 == n && ne[2] > 2*n) { printf(" ..., \n"); @@ -507,6 +573,120 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { } printf(" ]\n"); } + printf(" ]\n"); +} + +static void save_tensor_to_file(const struct ggml_tensor * tensor) { + char filename[512]; + snprintf(filename, sizeof(filename), "%s_cpp.txt", tensor->name); + + FILE * f = fopen(filename, "w"); + if (!f) { + fprintf(stderr, "Failed to open %s\n", filename); + return; + } + + // Check tensor size and warn if too large + int64_t total_elements = ggml_nelements(tensor); + fprintf(stderr, "Saving tensor %s (%lld elements) to %s\n", + tensor->name, (long long)total_elements, filename); + + if (total_elements > 10000000) { // 10M elements + fprintf(stderr, "Warning: tensor is very large (%lld elements), this may take time\n", + (long long)total_elements); + } + + uint8_t * data = (uint8_t *) tensor->data; + ggml_type type = tensor->type; + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + // Use a buffer to reduce I/O calls + const size_t BUF_SIZE = 8192; + char * buf = (char *) malloc(BUF_SIZE); + if (!buf) { + fprintf(stderr, "Failed to allocate buffer\n"); + fclose(f); + return; + } + size_t buf_pos = 0; + + // Helper lambda to flush buffer + auto flush_buf = [&]() { + if (buf_pos > 0) { + fwrite(buf, 1, buf_pos, f); + buf_pos = 0; + } + }; + + // Helper to append to buffer + auto append = [&](const char * str, size_t len) { + if (buf_pos + len >= BUF_SIZE) { + flush_buf(); + } + if (len >= BUF_SIZE) { + // String too large for buffer, write directly + fwrite(str, 1, len, f); + } else { + memcpy(buf + buf_pos, str, len); + buf_pos += len; + } + }; + + auto append_str = [&](const char * str) { + append(str, strlen(str)); + }; + + char num_buf[32]; + + // Write header once for all batches + append_str(tensor->name); + append_str(".data: [\n"); + + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + append_str(" [\n"); // Start of batch + for (int64_t i2 = 
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            append_str("        [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                append_str("            [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+                    float v;
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(float *) &data[i];
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(int32_t *) &data[i];
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(int16_t *) &data[i];
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(int8_t *) &data[i];
+                    } else {
+                        GGML_ABORT("fatal error");
+                    }
+                    int len = snprintf(num_buf, sizeof(num_buf), "%8.4f", v);
+                    append(num_buf, len);
+                    if (i0 < ne[0] - 1) append_str(", ");
+                }
+                append_str("],\n");
+            }
+            append_str("        ],\n");
+        }
+        append_str("    ]"); // End of batch
+        if (i3 < ne[3] - 1) {
+            append_str(",\n"); // Comma between batches
+        } else {
+            append_str("\n");
+        }
+    }
+
+    append_str("]\n"); // Close the top-level array
+
+    flush_buf();
+    free(buf);
+    fclose(f);
+    fprintf(stderr, "Tensor saved successfully\n");
+}
 
 //
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 82d0b46a47..4b7a4a563f 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -681,16 +684,19 @@ struct clip_graph {
         const auto tgt_size = inpL->ne[1];
         const auto str_size = model.pos_embed->ne[1];
         if (str_size != tgt_size) {
+            ggml_tensor * old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
+            // TODO: ggml_interpolate doesn't support bicubic mode on the CUDA backend
             ggml_tensor * new_pos_embed = ggml_interpolate(
                 ctx0,
-                model.pos_embed,
+                old_pos_embed,
                 tgt_size,
                 tgt_size,
                 enc_n_embd,
                 1,
                 ggml_scale_mode::GGML_SCALE_MODE_BICUBIC
             );
-            new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 2,1,0,3));
+            new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
             cur = ggml_add(ctx0, inpL, new_pos_embed);
         } else {
             cur = ggml_add(ctx0, inpL, model.pos_embed);
@@ -699,10 +702,10 @@ struct clip_graph {
         // loop over layers
         for (int il = 0; il < _depth; il++) {
             auto & layer = model.sam_layers[il];
+            ggml_tensor * shortcut = cur;
 
             // layernorm1
             cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
-            cb(cur, "enc_layer_inp_normed", il);
 
             const int64_t w0 = cur->ne[1];
             const int64_t h0 = cur->ne[2];
@@ -711,7 +714,7 @@ struct clip_graph {
                 // local attention layer - apply window partition
                 // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L169-L172
                 //cur = ggml_win_part(ctx0, cur, 14);
-                cur = window_partition(ctx0, cur, 14);
+                cur = window_partition(ctx0, cur, 14); // TODO: make the window size configurable
             }
 
             const int64_t W = cur->ne[1];
             const int64_t H = cur->ne[2];
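For review, a summary of what the rewritten self-attention hunk below computes (my notation, not text from the patch): per head, SAM's attention with decomposed relative positions is

    attn = softmax(Q K^T / sqrt(d) + B) V
    B[(h,w), (h',w')] = q_(h,w) . Rh[h - h' + H - 1] + q_(h,w) . Rw[w - w' + W - 1]

where Rh and Rw are the per-axis relative-position tables gathered by get_rel_pos. Because B depends on Q, it cannot be passed as a plain broadcast bias; the new code materializes it as an explicit [W*H, W*H] tensor, pads the query dimension by WH_pad, casts it to F16, and feeds it to ggml_flash_attn_ext as the mask, which is exactly the contract asserted in the ggml.c hunk above.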
@@ -719,110 +722,93 @@ struct clip_graph {
 
             // self-attention
             {
+                const int B = cur->ne[3];
+
                 cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
                 cur = ggml_add(ctx0, cur, layer.qkv_b);
 
-                const int B = cur->ne[3];
+                cur = ggml_cont(ctx0, cur); // ensure the tensor is contiguous before reshaping
+                cur = ggml_reshape_4d(ctx0, cur, enc_n_embd, 3, W*H, B);
 
-                cur = ggml_reshape_4d(ctx0, cur, enc_n_embd, 3, W * H, B);
-                cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 3, 1, 2));
+                ggml_tensor * Q;
+                ggml_tensor * K;
+                ggml_tensor * V;
 
-                ggml_tensor * Qcur =
-                    ggml_view_3d(ctx0, cur, enc_n_embd, W * H, B, cur->nb[1], cur->nb[2], 0);
-                Qcur = ggml_reshape_4d(ctx0, Qcur, enc_d_heads, enc_n_heads, W * H, B);
-                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
-                Qcur = ggml_reshape_3d(ctx0, Qcur, enc_d_heads, W * H, B * enc_n_heads);
+                Q = ggml_view_3d   (ctx0, cur, enc_n_embd, W*H, B, cur->nb[2], cur->nb[3], 0*cur->nb[1]);
+                Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), enc_d_heads, enc_n_heads, W*H, B);
+                Q = ggml_cont      (ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); // [B, enc_n_heads, H*W, enc_d_heads]
 
-                ggml_tensor * Kcur =
-                    ggml_view_3d(ctx0, cur, enc_n_embd, W * H, B, cur->nb[1], cur->nb[2], cur->nb[3]);
-                Kcur = ggml_reshape_4d(ctx0, Kcur, enc_d_heads, enc_n_heads, W * H, B);
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-                Kcur = ggml_reshape_3d(ctx0, Kcur, enc_d_heads, W * H, B * enc_n_heads);
+                K = ggml_view_3d   (ctx0, cur, enc_n_embd, W*H, B, cur->nb[2], cur->nb[3], 1*cur->nb[1]);
+                K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), enc_d_heads, enc_n_heads, W*H, B);
+                K = ggml_cont      (ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); // [B, enc_n_heads, H*W, enc_d_heads]
 
-                ggml_tensor * Vcur =
-                    ggml_view_3d(ctx0, cur, enc_n_embd, W * H, B, cur->nb[1], cur->nb[2], 2 * cur->nb[3]);
-                Vcur = ggml_reshape_4d(ctx0, Vcur, enc_d_heads, enc_n_heads, W * H, B);
-                Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3)); // transposed
-                Vcur = ggml_reshape_3d(ctx0, Vcur, W * H, enc_d_heads, B * enc_n_heads);
+                V = ggml_view_3d   (ctx0, cur, enc_n_embd, W*H, B, cur->nb[2], cur->nb[3], 2*cur->nb[1]);
+                V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), enc_d_heads, enc_n_heads, W*H, B);
+                V = ggml_cont      (ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3)); // [B, enc_n_heads, H*W, enc_d_heads]
 
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
+                ggml_tensor * mask;
+                ggml_tensor * rw;
+                ggml_tensor * rh;
+                ggml_tensor * qr;
 
-                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcur, Qcur);
+                rw = get_rel_pos(ctx0, layer.rel_pos_w, W, W); // [W, W, C]
+                rh = get_rel_pos(ctx0, layer.rel_pos_h, H, H); // [H, H, C]
+                qr = ggml_reshape_4d(ctx0, Q, enc_d_heads, W, H, B*enc_n_heads);
 
-                struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf(enc_d_heads));
+                const int WH_pad = GGML_PAD(W*H, GGML_KQ_MASK_PAD) - W*H;
 
-                struct ggml_tensor * rw = get_rel_pos(ctx0, layer.rel_pos_w, W, W);
-                struct ggml_tensor * rh = get_rel_pos(ctx0, layer.rel_pos_h, H, H);
+                rw = ggml_mul_mat   (ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*enc_n_heads, W, H, W]
+                rw = ggml_cont      (ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3));                      // [B*enc_n_heads, H, W, W]
+                rw = ggml_reshape_4d(ctx0, rw, W, 1, W*H, enc_n_heads*B);
+                rw = ggml_repeat_4d (ctx0, rw, W, H, W*H, enc_n_heads*B);
+                rh = ggml_mul_mat   (ctx0, rh, qr); // [B*enc_n_heads, H, W, H]
+                rh = ggml_reshape_4d(ctx0, rh, 1, H, W*H, enc_n_heads*B);
+                mask = ggml_add       (ctx0, rw, rh); // [B*enc_n_heads, H*W, H, W]
+                mask = ggml_reshape_4d(ctx0, mask, W*H, W*H, enc_n_heads, B);
+                mask = ggml_pad       (ctx0, mask, 0, WH_pad, 0, 0);
+                mask = ggml_cast      (ctx0, mask, GGML_TYPE_F16);
 
-                struct ggml_tensor * q_r = ggml_reshape_4d(ctx0, Qcur, enc_d_heads, W, H, B * enc_n_heads);
-
-                struct ggml_tensor * rel_w = ggml_cont(ctx0, ggml_permute(ctx0,
-                    ggml_mul_mat(ctx0,
-                        rw,
-                        ggml_cont(ctx0, ggml_permute(ctx0, q_r, 0, 2, 1, 3))),
-                    0, 2, 1, 3));
-                struct ggml_tensor * rel_h = ggml_mul_mat(ctx0, rh, q_r);
-
-                struct ggml_tensor * attn = add_rel_pos_inplace(ctx0, KQ_scaled, rel_w, rel_h);
-
-                struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, attn);
-
-                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, Vcur, KQ_soft_max);
-
-                cur = ggml_reshape_4d(
-                        ctx0,
-                        ggml_cont(ctx0, ggml_permute(ctx0,
-                                ggml_reshape_4d(ctx0, KQV, enc_d_heads, W * H, enc_n_heads, B),
-                                0, 2, 1, 3)),
-                        enc_n_embd, W, H, B);
+                float scale = 1.0f / sqrtf((float) enc_d_heads);
 
+                cur = ggml_flash_attn_ext(ctx0, Q, K, V, mask, scale, 0.0f, 0.0f); // [B, H*W, enc_n_heads, enc_d_heads]
+                cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), enc_n_embd, W, H, B);
 
                 cur = ggml_mul_mat(ctx0, layer.o_w, cur);
                 cur = ggml_add_inplace(ctx0, cur, layer.o_b);
             }
 
             if (hparams.is_global_attn(il) == false) {
                 // local attention layer - reverse window partition
-                cur = window_unpartition(ctx0, cur, w0, h0, 14);
+                cur = window_unpartition(ctx0, cur, w0, h0, 14); // TODO: make the window size configurable
             }
 
             // re-add the layer input, e.g., residual
-            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, shortcut);
 
             ggml_tensor * inpFF = cur;
-
-            cb(inpFF, "ffn_inp", il);
-
             // layernorm2
             cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
-            cb(cur, "ffn_inp_normed", il);
 
             // ffn
             cur = build_ffn(cur,
                 layer.ff_up_w, layer.ff_up_b,
                 nullptr, nullptr,
                 layer.ff_down_w, layer.ff_down_b,
                 hparams.ffn_op, il);
-            cb(cur, "ffn_out", il);
-
-            // residual 2
             cur = ggml_add(ctx0, cur, inpFF);
-            cb(cur, "layer_out", il);
+            cb(cur, "sam_layer_out", il);
         }
 
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 2, 0, 1, 3));
+        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
 
-        cur = ggml_conv_2d_sk_p0(ctx0, model.neck_0_w, cur);
+        const int out_chans = model.neck_0_w->ne[3];
 
-        cur = sam_layer_norm_2d(ctx0, cur, 256, model.neck_1_w, model.neck_1_b, hparams.eps);
+        cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
+        cur = sam_layer_norm_2d(ctx0, cur, out_chans, model.neck_1_w, model.neck_1_b, hparams.eps);
+        cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
+        cur = sam_layer_norm_2d(ctx0, cur, out_chans, model.neck_3_w, model.neck_3_b, hparams.eps);
 
-        cur = ggml_conv_2d_s1_ph(ctx0, model.neck_2_w, cur);
-
-        cur = sam_layer_norm_2d(ctx0, cur, 256, model.neck_3_w, model.neck_3_b, hparams.eps);
-
-        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2,2,1,1, 1,1);
-        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2,2,1,1, 1,1);
+        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
+        cb(cur, "sam_output", -1);
 
         ggml_build_forward_expand(gf, cur);
         return cur;
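A quick shape sanity check on the neck above, assuming the standard SAM-B geometry of a 1024x1024 input with 16x16 patches, i.e. a 64x64 token grid (these numbers are mine, not stated by the patch): neck_0_w (1x1 conv, stride 1, pad 0) and neck_2_w (3x3 conv, stride 1, pad 1) both preserve the 64x64 grid while projecting to out_chans channels, and net_2/net_3 (3x3 convs, stride 2, pad 1) each halve it, floor((64 + 2*1 - 3)/2) + 1 = 32, then 16, so "sam_output" is a 16x16 map. Reading out_chans from neck_0_w instead of hard-coding 256 keeps this independent of the checkpoint's channel count.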
@@ -2576,19 +2562,27 @@ private:
         if (max_rel_dist != L) {
             // Linear interpolation
-            const auto scale = L / static_cast<float>(max_rel_dist);
-            ggml_tensor * indices = ggml_arange(ctx, 0.0f, static_cast<float>(max_rel_dist), 1.0f);
-            indices = ggml_scale_inplace(ctx, indices, scale);
-            ggml_tensor * indices_floor = ggml_cast(ctx, ggml_floor(ctx, indices), GGML_TYPE_I32);
-            ggml_tensor * indices_ceil  = ggml_cast(ctx, ggml_ceil(ctx, indices), GGML_TYPE_I32);
-            ggml_tensor * weights = ggml_sub(ctx, indices, indices_floor);
-            ggml_tensor * ws1 = ggml_scale_bias(ctx, weights, -1.0f, 1.0f);
-            rel_pos_resized = ggml_cont(ctx, ggml_permute(ctx, rel_pos_resized, 1, 0, 2, 3)); // [C, L] for ggml_get_rows
-            ggml_tensor * rs1 = ggml_cont(ctx, ggml_permute(ctx, ggml_get_rows(ctx, rel_pos_resized, indices_floor), 1, 0, 2, 3)); // lower rows
-            rs1 = ggml_mul(ctx, rs1, ws1); // lower rows
-            ggml_tensor * rs2 = ggml_cont(ctx, ggml_permute(ctx, ggml_get_rows(ctx, rel_pos_resized, indices_ceil), 1, 0, 2, 3)); // upper rows
-            rs2 = ggml_mul(ctx, rs2, weights); // upper rows
-            rel_pos_resized = ggml_add(ctx, rs1, rs2);
+            int64_t ne0 = rel_pos_resized->ne[0];
+            int64_t ne1 = rel_pos_resized->ne[1];
+            int64_t ne2 = rel_pos_resized->ne[2];
+            int64_t ne3 = rel_pos_resized->ne[3];
+
+            rel_pos_resized = ggml_reshape_3d(
+                ctx,
+                ggml_cont(ctx, ggml_permute(ctx, rel_pos_resized, 1, 0, 2, 3)),
+                ne1, 1, ne0*ne2*ne3
+            );
+            rel_pos_resized = ggml_reshape_4d(
+                ctx,
+                ggml_interpolate(
+                    ctx,
+                    rel_pos_resized,
+                    max_rel_dist, 1, ne0*ne2*ne3, 1,
+                    ggml_scale_mode::GGML_SCALE_MODE_BILINEAR
+                ),
+                max_rel_dist, ne0, ne2, ne3
+            );
+            rel_pos_resized = ggml_cont(ctx, ggml_permute(ctx, rel_pos_resized, 1, 0, 2, 3));
         }
 
         // -------------------------------------------------
@@ -2627,7 +2621,7 @@ private:
         rel = ggml_sub(ctx, q_coord, k_coord);                          // [q_size, k_size]
         rel = ggml_scale_bias(ctx, rel, 1.0f, (k_size - 1.0f)*k_scale); // [q_size, k_size]
         // Clamp to [0, L-1] range for valid indexing
-        rel = ggml_clamp(ctx, rel, 0.0f, static_cast<float>(rel_pos->ne[1] - 1));
+        rel = ggml_clamp(ctx, rel, 0.0f, static_cast<float>(rel_pos_resized->ne[1] - 1));
 
         // -------------------------------------------------
         // clamp to [0, L-1] and cast to int32 (for ggml_get_rows)
@@ -2641,7 +2635,7 @@ private:
         // flatten to 1D for ggml_get_rows
         int qk = q_size * k_size;
         ggml_tensor * idx_flat = ggml_reshape_1d(ctx, idx_2d, qk);              // [qk]
-        ggml_tensor * gathered = ggml_get_rows(ctx, rel_pos, idx_flat);         // [qk, C]
+        ggml_tensor * gathered = ggml_get_rows(ctx, rel_pos_resized, idx_flat); // [qk, C]
 
         // -------------------------------------------------
         // Gather from rel_pos → [qk, C]
@@ -2671,7 +2665,7 @@ private:
         }
         x = ggml_reshape_4d(ctx, x, c * window, npw, window, nph * b);
         x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));
-        x = ggml_reshape_4d(ctx, x, c, window ,window, npw * nph * b);
+        x = ggml_reshape_4d(ctx, x, c, window, window, npw * nph * b);
         return x;
     }
 
@@ -5078,6 +5072,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
     const int orig_w = original_size.width;
     const int orig_h = original_size.height;
     const int orig_area = orig_h * orig_w;
+    std::array<uint8_t, 3> color;
+
+    for (int i = 0; i < 3; i++) {
+        color[i] = (uint8_t) (255 * params.image_mean[i]);
+    }
 
     // mode selection logic (find most suitable resolution)
     int mode_i = 0;
@@ -5100,7 +5099,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 
         clip_image_u8_ptr resized_img(clip_image_u8_init());
         img_tool::resize(*img, *resized_img, clip_image_size{image_size, image_size},
-            img_tool::RESIZE_ALGO_BICUBIC); // Match PIL default
+            img_tool::RESIZE_ALGO_BICUBIC, true, color); // Match PIL default
 
         clip_image_f32_ptr res(clip_image_f32_init());
         normalize_image_u8_to_f32(*resized_img, *res, params.image_mean, params.image_std);
@@ -5122,7 +5121,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 
         clip_image_u8_ptr scaled_img(clip_image_u8_init());
         img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h},
-            img_tool::RESIZE_ALGO_BICUBIC);
+            img_tool::RESIZE_ALGO_BICUBIC, true, color);
 
         // Use mean color for padding
         unsigned char pad_r = static_cast<unsigned char>(params.image_mean[0] * 255.0f);
@@ -5801,8 +5800,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (ggml_tensor * t : ctx->debug_print_tensors) {
             std::vector<uint8_t> data(ggml_nbytes(t));
             ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_info(t);
             print_tensor_shape(t);
-            print_tensor_data(t, data.data(), 3);
+            print_tensor_sum(t, data.data(), 3);
+
+            std::string tname_s = std::string(t->name);
+            bool is_stored = false;
+            std::vector<std::string> patterns = {
+                /* Add tensor names here to dump (e.g. "sam_output") */
"sam_output") */ + "sam_output" + }; + + for (auto & p : patterns) { + if (tname_s == p) { + save_tensor_to_file(t); + is_stored = true; + break; + } + } + + if (!is_stored) { + print_tensor_data(t, data.data(), 3); + } } }