From d981f19e9dac61fad57e7ee8aafea62a35eeb8b8 Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Fri, 5 Dec 2025 13:18:15 +0100 Subject: [PATCH] minor editorconfig-check fixes --- tools/mtmd/clip-impl.h | 10 ++++--- tools/mtmd/clip.cpp | 62 +++++++++++++++++++++--------------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 3e9b5639ba..b8bf5ac899 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -457,7 +457,7 @@ static std::string to_ne_string(const ggml_tensor * t) { static void print_tensor_info(ggml_tensor * t) { const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; - + char src1_str[128] = {0}; if (src1) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, to_ne_string(src1).c_str()); @@ -643,7 +643,7 @@ static void save_tensor_to_file(const struct ggml_tensor * tensor, const uint8_t append_str(" ["); for (int64_t i0 = 0; i0 < ne[0]; i0++) { size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - float v; + float v; if (type == GGML_TYPE_F16) { v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); } else if (type == GGML_TYPE_F32) { @@ -659,13 +659,15 @@ static void save_tensor_to_file(const struct ggml_tensor * tensor, const uint8_t } int len = snprintf(num_buf, sizeof(num_buf), "%8.4f", v); append(num_buf, len); - if (i0 < ne[0] - 1) append_str(", "); + if (i0 < ne[0] - 1) { + append_str(", "); + } } append_str("],\n"); } append_str(" ],\n"); } - append_str(" ]"); // End of batch + append_str(" ]"); // End of batch if (i3 < ne[3] - 1) { append_str(",\n"); // Comma between batches } else { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 8fcedafeda..80eb998ad0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -662,19 +662,19 @@ struct clip_graph { ggml_cgraph * build_deepseek_ocr() { //patch embedding - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * sam_out = build_sam(inp_raw); + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * sam_out = build_sam(inp_raw); ggml_tensor * clip_out = build_dsocr_clip(sam_out); - + int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; - - sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); - sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); + + sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); + sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); - + ggml_tensor * cur; cur = ggml_concat(ctx0, clip_out, sam_out, 0); - cur = ggml_reshape_2d(ctx0, cur, 2*n_embd,clip_n_patches); + cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); cur = ggml_cont(ctx0, cur); cur = ggml_mul_mat(ctx0, model.fc_w, cur); cur = ggml_add(ctx0, cur, model.fc_b); @@ -687,10 +687,10 @@ struct clip_graph { ggml_tensor * vs; imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); - vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) - cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); - cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w+1)*h); - cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) + vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) + cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); + cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); + cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) cb(cur, "dsocr_output", -1); @@ -2156,7 +2156,7 @@ private: ggml_tensor * Qcur; ggml_tensor * Kcur; ggml_tensor * Vcur; - + if (layer.qkv_w) { ggml_tensor * QKV; @@ -2181,12 +2181,12 @@ private: if (layer.q_b) { Qcur = ggml_add(ctx0, Qcur, layer.q_b); } - + Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); if (layer.k_b) { Kcur = ggml_add(ctx0, Kcur, layer.k_b); } - + Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); if (layer.v_b) { Vcur = ggml_add(ctx0, Vcur, layer.v_b); @@ -2591,7 +2591,7 @@ private: } else { ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); v = ggml_cont(ctx0, v); - + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); // F32 may not needed for vision encoders? // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); @@ -2727,11 +2727,11 @@ private: const int d_heads = n_embd / n_heads; ggml_tensor * inpL; - + inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); - + ggml_tensor * cur; const auto tgt_size = inpL->ne[1]; const auto str_size = model.pos_embed->ne[1]; @@ -2776,7 +2776,7 @@ private: // self-attention { const int B = cur->ne[3]; - + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); cur = ggml_add(ctx0, cur, layer.qkv_b); cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape @@ -2853,7 +2853,7 @@ private: cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - + cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); @@ -2883,7 +2883,7 @@ private: if (tgt_size != src_size) { ggml_tensor * old_pos_embd; ggml_tensor * cls_tok; - + old_pos_embd = ggml_view_2d( ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size, @@ -2912,7 +2912,7 @@ private: ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); - ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, ffn_op_type::FFN_GELU_QUICK, + ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, ffn_op_type::FFN_GELU_QUICK, learned_pos_embd, nullptr); // shape [1024, 16, 16] ggml_build_forward_expand(gf, cur); @@ -5193,11 +5193,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str const int orig_h = original_size.height; const int orig_area = orig_h * orig_w; std::array color; - + for (int i = 0; i < 3; i++) { color[i] = (int)(255 * params.image_mean[i]); } - + int mode_i = 0; int min_diff = orig_area; @@ -5212,7 +5212,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str if (mode_i < 2) { /* Native Resolution (Tiny/Small) */ const int image_size = native_resolutions[mode_i]; - + // Just resize the image to image_size × image_size clip_image_u8_ptr resized_img(clip_image_u8_init()); img_tool::resize(*img, *resized_img, @@ -5229,7 +5229,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str else if (mode_i < 4) { /* Native Resolution (Base/Large) */ const int image_size = native_resolutions[mode_i]; - + // Resize maintaining aspect ratio, then pad to square float scale = std::min( static_cast(image_size) / orig_w, @@ -5286,7 +5286,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str else { GGML_ABORT("DeepSeek-OCR hasn't supported Gundam/Gundam-Master yet"); /* Dynamic Resolution (Gundam/Gundam-Master) */ - + // configurable, or read from params const int min_num = 2; const int max_num = 9; @@ -5295,10 +5295,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str // original image size const int orig_w = original_size.width; const int orig_h = original_size.height; - + // create overview image (thumbnail) clip_image_u8_ptr overview_img(clip_image_u8_init()); - img_tool::resize(*img, *overview_img, { image_size, image_size }, + img_tool::resize(*img, *overview_img, { image_size, image_size }, img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color); clip_image_f32_ptr overview_f32(clip_image_f32_init()); normalize_image_u8_to_f32(*overview_img, *overview_f32, params.image_mean, params.image_std); @@ -5306,7 +5306,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str // build candidate grids (cols, rows) auto target_ratios = ds_build_target_ratios(min_num, max_num); - + // pick the grid that best matches the original aspect ratio const float aspect_ratio = static_cast(orig_w) / static_cast(orig_h); auto best = ds_find_closest_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size); @@ -5315,7 +5315,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str // resize to refined size (no padding, direct resize) clip_image_u8_ptr refined_img(clip_image_u8_init()); - img_tool::resize(*img, *refined_img, { image_size * grid_cols, image_size * grid_rows }, + img_tool::resize(*img, *refined_img, { image_size * grid_cols, image_size * grid_rows }, img_tool::RESIZE_ALGO_BICUBIC_PILLOW, false); // crop slices from the refined image