Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr
commit f8f66a151b
@@ -820,6 +820,7 @@ struct clip_graph {
        // TODO: better implementation
        layer = ggml_permute(ctx0, ggml_norm(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, layer, 1, 2, 0, 3)), eps), 2, 0,
                             1, 3);
        layer = ggml_cont(ctx0, layer);

        layer =
            ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, w, 1, 1, n_channels), layer), layer),
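For context, the expression in this hunk is a channel-wise LayerNorm over a [W, H, C, N] activation: the tensor is permuted so the channel dimension is the one ggml_norm reduces over, normalized, permuted back, and then scaled and shifted with per-channel weights broadcast via ggml_repeat. Below is a minimal sketch of that pattern; the helper name clip_layer_norm_2d and the bias tensor b (which the truncated line above presumably applies) are assumptions for illustration, not part of the patch.

#include "ggml.h"

// Hypothetical helper mirroring the pattern in the hunk above: LayerNorm over
// the channel dimension of a [W, H, C, N] tensor, followed by a per-channel
// affine transform (w * x + b). The bias term b is assumed, not shown in the diff.
static ggml_tensor * clip_layer_norm_2d(ggml_context * ctx0, ggml_tensor * layer,
                                        ggml_tensor * w, ggml_tensor * b,
                                        int n_channels, float eps) {
    // move channels to dim 0 so ggml_norm normalizes across C: [C, W, H, N]
    layer = ggml_cont(ctx0, ggml_permute(ctx0, layer, 1, 2, 0, 3));
    layer = ggml_norm(ctx0, layer, eps);
    // restore the original [W, H, C, N] layout
    layer = ggml_cont(ctx0, ggml_permute(ctx0, layer, 2, 0, 1, 3));
    // broadcast the [C] weights to [1, 1, C] and apply scale and shift
    layer = ggml_mul(ctx0, layer, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, w, 1, 1, n_channels), layer));
    layer = ggml_add(ctx0, layer, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, b, 1, 1, n_channels), layer));
    return layer;
}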
@@ -1538,8 +1539,7 @@ struct clip_graph {
        GGML_ASSERT(model.position_embeddings != nullptr);

        const int n_pos = n_patches + 1;
-       ggml_tensor * inp = ggml_permute(ctx0, patch_embeds, 2, 1, 0, 3);
-       inp = ggml_cont(ctx0, inp);
+       ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, patch_embeds, 2, 1, 0, 3));
        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
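The permute/cont/reshape pair in this hunk flattens the patch embeddings into the [n_embd, n_patches] matrix that the transformer blocks consume. A minimal sketch of that flattening is below; the assumption that patch_embeds arrives as an [n_patches_x, n_patches_y, n_embd, 1] tensor from the patch convolution is mine, not stated in the patch.

#include "ggml.h"

// Hypothetical flattening helper mirroring the hunk above; assumes the patch
// convolution produced patch_embeds with shape [n_patches_x, n_patches_y, n_embd, 1].
static ggml_tensor * flatten_patch_embeds(ggml_context * ctx0, ggml_tensor * patch_embeds,
                                          int n_embd, int n_patches) {
    // move the embedding dim to position 0: [n_embd, n_patches_y, n_patches_x, 1]
    ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, patch_embeds, 2, 1, 0, 3));
    // collapse the two spatial dims into a single token dim: [n_embd, n_patches]
    return ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
}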
@@ -1551,7 +1551,7 @@ struct clip_graph {
        norm_type norm_t = NORM_TYPE_NORMAL;

        // for selecting learned pos embd, used by ViT
-       struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+       ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
        cb(positions, "positions", -1);
        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
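The positions tensor in this hunk is just a list of row indices: at evaluation time it is filled with 0 … n_pos-1 so that ggml_get_rows selects one learned embedding per position. A minimal sketch of that fill step, assuming the tensor has already been allocated in a backend buffer (the set_vit_positions name is hypothetical):

#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: write the indices 0..n_pos-1 into the I32 'positions'
// tensor so that ggml_get_rows() picks row i of the learned position
// embeddings for token i. Assumes 'positions' already lives in a backend buffer.
static void set_vit_positions(ggml_tensor * positions, int n_pos) {
    std::vector<int32_t> pos(n_pos);
    for (int i = 0; i < n_pos; ++i) {
        pos[i] = i;
    }
    ggml_backend_tensor_set(positions, pos.data(), 0, n_pos * sizeof(int32_t));
}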
@@ -5237,7 +5237,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img
            } break;
        case PROJECTOR_TYPE_DEEPSEEKOCR:
            {
                n_patches += 2;
                int x_patch = img->nx / (params.patch_size);

                n_patches += x_patch + 1;
            } break;
        default:
            GGML_ABORT("unsupported projector type");
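As a worked example of the new DeepSeek-OCR token count, the standalone sketch below reproduces just this case's arithmetic; the 1024x1024 image, the 16-pixel patch size, and the assumption that n_patches starts as the plain nx/patch x ny/patch grid are illustrative only and not taken from the patch.

#include <cstdio>

// Standalone sketch of the PROJECTOR_TYPE_DEEPSEEKOCR token-count arithmetic
// above. The starting value of n_patches is assumed to be the raw patch grid;
// the real clip_n_output_tokens() may compute it differently.
int main() {
    const int nx = 1024, ny = 1024;   // hypothetical image size
    const int patch_size = 16;        // hypothetical patch size

    int n_patches = (nx / patch_size) * (ny / patch_size);   // 64 * 64 = 4096

    n_patches += 2;                                          // 4098
    int x_patch = nx / patch_size;                           // 64 patches along x
    n_patches += x_patch + 1;                                // 4098 + 65 = 4163

    printf("n_output_tokens = %d\n", n_patches);
    return 0;
}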