diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 1b9e8a2464..3c4388f8a5 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -39,6 +39,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <map>
 
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
     size_t nels = ggml_nelements(tensor);
@@ -6725,11 +6726,63 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         }
     }
-    for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-        for (auto act_case : cases) {
-            // Direct CONV_2D
-            test_cases.emplace_back(new test_conv_2d_implicit(
-                { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
-                { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] },
-                kernel_type, 1, 1, 0, 0, 1, 1, false));
-        }
-    }
+    // for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+    //     for (auto act_case : cases) {
+    //         // Direct CONV_2D
+    //         test_cases.emplace_back(new test_conv_2d_implicit(
+    //             { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
+    //             { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] },
+    //             kernel_type, 1, 1, 0, 0, 1, 1, false));
+    //     }
+    // }
+
+    // Stable-diffusion layers
+    std::map<std::string, uint32_t> idx_sd{
+        { "iw", 0 },
+        { "ih", 1 },
+        { "kw", 2 },
+        { "kh", 3 },
+        { "Cout", 4 },
+        { "Cin", 5 },
+        { "B", 6 },
+    };
+
+    // Input image size
+    uint32_t w = 768;
+    uint32_t h = 1024;
+
+    // Number of filters (base)
+    uint32_t Cout_b = 128;
+    uint32_t Cin_b = 128;
+
+    std::vector<std::vector<uint32_t>> cases_sd = {
+        { w / 8, h / 8, 3, 3, Cout_b * 4, Cin_b * 4, 1 }, // x10 (called 10 times)
+        { w / 4, h / 4, 3, 3, Cout_b * 4, Cin_b * 4, 1 }, // x7
+        { w / 2, h / 2, 3, 3, Cout_b * 2, Cin_b * 2, 1 }, // x5
+        { w, h, 3, 3, Cout_b, Cin_b, 1 }, // x5
+        { w / 8, h / 8, 1, 1, Cout_b * 4, Cin_b * 4, 1 }, // x4
+        { w / 8, h / 8, 1, 1, 4, 4, 1 },
+        { w / 8, h / 8, 3, 3, Cout_b * 4, 4, 1 },
+
+        { w / 2, h / 2, 3, 3, Cout_b * 4, Cin_b * 4, 1 },
+        { w / 2, h / 2, 3, 3, Cout_b * 2, Cin_b * 4, 1 },
+        { w / 2, h / 2, 1, 1, Cout_b * 2, Cin_b * 4, 1 },
+
+        { w, h, 3, 3, Cout_b, Cin_b * 2, 1 },
+        { w, h, 1, 1, Cout_b, Cin_b * 2, 1 },
+        { w, h, 3, 3, Cout_b * 2, Cin_b * 2, 1 },
+
+        { w, h, 3, 3, 3, Cin_b, 1 },
+    };
+
+    for (auto act_case : cases_sd) {
+        GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
+        GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
+
+        uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
+        uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
+
+        test_cases.emplace_back(new test_conv_2d_implicit(
+            { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
+            { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
+            GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
+    }
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index 6077299cb4..e963e2b361 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -63,8 +63,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
     size_t buffer_size = 0;
     {
-        // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
-        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
+        // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
         buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
         buffer_size += 1024; // overhead
     }
 
@@ -112,7 +112,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
     model.ctx = ggml_init(params);
 
     // create tensors
-    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
+    // model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
+    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
     model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
 
     // create a allocator
@@ -123,9 +124,11 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
-        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
+        // memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
+        memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
     } else {
-        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
+        // ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
+        ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
     }
 
     // alloc memory