use Vx instead of _VECx for vector preprocessor types

2026-03-19 11:41:45 +01:00 · 2026-03-19 11:41:45 +01:00 · cca358b43b
parent b486c17b3e
commit cca358b43b
10 changed files with 167 additions and 167 deletions
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl
@ -6,8 +6,8 @@
 #define MAT_VEC_FUSION_FLAGS_SCALE1 0x8

 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-#if defined(A_TYPE_VEC4)
-layout (binding = 0) readonly buffer AV4 {A_TYPE_VEC4 data_a_v4[];};
+#if defined(A_TYPEV4)
+layout (binding = 0) readonly buffer AV4 {A_TYPEV4 data_a_v4[];};
 #endif
 #if defined(A_TYPE_PACKED16)
 layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
@ -17,11 +17,11 @@ layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32
 #endif

 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-#ifdef B_TYPE_VEC2
-layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
+#ifdef B_TYPEV2
+layout (binding = 1) readonly buffer BV2 {B_TYPEV2 data_b_v2[];};
 #endif
-#ifdef B_TYPE_VEC4
-layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
+#ifdef B_TYPEV4
+layout (binding = 1) readonly buffer BV4 {B_TYPEV4 data_b_v4[];};
 #endif

 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@ -41,7 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
        const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
        const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));

-        const FLOAT_TYPE_VEC2 dm = vec2(data_a[ib0 + i].dm);
+        const FLOAT_TYPEV2 dm = vec2(data_a[ib0 + i].dm);

        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
            vec2 b0 =   vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 +  0]);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
@ -14,7 +14,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,

    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
        const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
-        const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);
+        const FLOAT_TYPEV2 dm = FLOAT_TYPEV2(data_a[ib0 + i].dm);

        const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
        const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
@ -14,7 +14,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,

    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
        const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
-        const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);
+        const FLOAT_TYPEV2 dm = FLOAT_TYPEV2(data_a[ib0 + i].dm);

        const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
        const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl
@ -11,8 +11,8 @@ FLOAT_TYPE get_dm(uint ib) {
 #endif

 #if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
-FLOAT_TYPE_VEC2 get_dm(uint ib) {
-    return FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
+FLOAT_TYPEV2 get_dm(uint ib) {
+    return FLOAT_TYPEV2(data_a_packed32[ib].dm);
 }
 #endif

@ -23,9 +23,9 @@ FLOAT_TYPE get_dm(uint ib) {
 #endif

 #if defined(DATA_A_Q2_K)
-FLOAT_TYPE_VEC2 get_dm(uint ib) {
+FLOAT_TYPEV2 get_dm(uint ib) {
    const uint ib_k = ib / 8;
-    return FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm);
+    return FLOAT_TYPEV2(data_a_packed32[ib_k].dm);
 }
 #endif

@ -304,7 +304,7 @@ vec2 get_dm_scale(uint ib, uint iqs) {
                          (data_a[ib_k].scales[is+4] >>  4) | ((data_a[ib_k].scales[is  ] & 0xC0) >> 2));
    }

-    return FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm) * FLOAT_TYPE_VEC2(scale_dm);
+    return FLOAT_TYPEV2(data_a_packed32[ib_k].dm) * FLOAT_TYPEV2(scale_dm);
 }

 FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
@ -422,7 +422,7 @@ vec2 get_dm(uint ib, uint iqs) {
    const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);

    // the -1 cancels out the bias in iq1s_grid_gpu
-    return FLOAT_TYPE_VEC2(dl, dl * (delta - 1));
+    return FLOAT_TYPEV2(dl, dl * (delta - 1));
 }

 FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@ -125,8 +125,8 @@ layout (constant_id = 3) const uint BK = 16;  // Assumed to be 32 if working wit
 #define SHMEM_STRIDE (BK / 2 + 1)
 #endif

-shared FLOAT_TYPE_VEC2 buf_a[BM * SHMEM_STRIDE];
-shared FLOAT_TYPE_VEC2 buf_b[BN * SHMEM_STRIDE];
+shared FLOAT_TYPEV2 buf_a[BM * SHMEM_STRIDE];
+shared FLOAT_TYPEV2 buf_b[BN * SHMEM_STRIDE];

 #define NUM_WARPS (BLOCK_SIZE / WARP)

@ -258,17 +258,17 @@ void main() {
        sums[i] = coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0f);
    }
 #else
-    ACC_TYPE_VEC2 sums[WMITER * TM * WNITER * TN/2];
+    ACC_TYPEV2 sums[WMITER * TM * WNITER * TN/2];
 #if defined(DATA_A_F32) || defined(DATA_A_F16)
-    FLOAT_TYPE_VEC4 cache_a[WMITER * TM];
-    FLOAT_TYPE_VEC4 cache_b;
+    FLOAT_TYPEV4 cache_a[WMITER * TM];
+    FLOAT_TYPEV4 cache_b;
 #else
-    FLOAT_TYPE_VEC2 cache_a[WMITER * TM];
-    FLOAT_TYPE_VEC2 cache_b;
+    FLOAT_TYPEV2 cache_a[WMITER * TM];
+    FLOAT_TYPEV2 cache_b;
 #endif

    [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN/2; i++) {
-        sums[i] = ACC_TYPE_VEC2(0.0f, 0.0f);
+        sums[i] = ACC_TYPEV2(0.0f, 0.0f);
    }
 #endif

--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl
@ -3,7 +3,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
 #if LOAD_VEC_A == 8
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
-            FLOAT_TYPE_VEC8 aa = FLOAT_TYPE_VEC8(data_a[idx]);
+            FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]);
            buf_a[buf_idx    ] = aa[0].xy;
            buf_a[buf_idx + 1] = aa[0].zw;
            buf_a[buf_idx + 2] = aa[1].xy;
@ -11,38 +11,38 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
 #elif LOAD_VEC_A == 4
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
-            FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(data_a[idx]);
+            FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]);
            buf_a[buf_idx    ] = aa.xy;
            buf_a[buf_idx + 1] = aa.zw;
 #else // LOAD_VEC_BATCH_A == 2
            const uint idx = pos_a + col * p.stride_a + row * 2;
            const uint buf_idx = col * SHMEM_STRIDE + row;
            if (idx_m < p.M && block + row * 2 + 1 < end_k) {
-                buf_a[buf_idx] = FLOAT_TYPE_VEC2(data_a[idx],
-                                                 data_a[idx + 1]);
+                buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx],
+                                              data_a[idx + 1]);
            } else if (idx_m < p.M && block + row * 2 < end_k) {
-                buf_a[buf_idx] = FLOAT_TYPE_VEC2(data_a[idx], 0.0f);
+                buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx], 0.0f);
            } else {
-                buf_a[buf_idx] = FLOAT_TYPE_VEC2(0.0f);
+                buf_a[buf_idx] = FLOAT_TYPEV2(0.0f);
            }
 #endif
 #elif defined(DATA_A_BF16)
 #if LOAD_VEC_A == 4
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
-            FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_a[idx]));
+            FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx]));
            buf_a[buf_idx    ] = aa.xy;
            buf_a[buf_idx + 1] = aa.zw;
 #else // LOAD_VEC_BATCH_A == 2
            const uint idx = pos_a + col * p.stride_a + row * 2;
            const uint buf_idx = col * SHMEM_STRIDE + row;
            if (idx_m < p.M && block + row * 2 + 1 < end_k) {
-                buf_a[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_a[idx]),
-                                                 TO_FLOAT_TYPE(data_a[idx + 1]));
+                buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]),
+                                              TO_FLOAT_TYPE(data_a[idx + 1]));
            } else if (idx_m < p.M && block + row * 2 < end_k) {
-                buf_a[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_a[idx]), 0.0f);
+                buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]), 0.0f);
            } else {
-                buf_a[buf_idx] = FLOAT_TYPE_VEC2(0.0f);
+                buf_a[buf_idx] = FLOAT_TYPEV2(0.0f);
            }
 #endif
 #elif defined(DATA_A_Q4_0)
@ -57,10 +57,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
            const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(v0.xy);
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v0.zw);
-            buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v1.xy);
-            buf_a[buf_idx + 9] = FLOAT_TYPE_VEC2(v1.zw);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(v0.xy);
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2(v0.zw);
+            buf_a[buf_idx + 8] = FLOAT_TYPEV2(v1.xy);
+            buf_a[buf_idx + 9] = FLOAT_TYPEV2(v1.zw);
 #elif defined(DATA_A_Q4_1)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
@ -73,10 +73,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * dm.x + dm.y;
            const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * dm.x + dm.y;

-            buf_a[buf_idx     ] = FLOAT_TYPE_VEC2(v0.xy);
-            buf_a[buf_idx + 1 ] = FLOAT_TYPE_VEC2(v0.zw);
-            buf_a[buf_idx + 8 ] = FLOAT_TYPE_VEC2(v1.xy);
-            buf_a[buf_idx + 9 ] = FLOAT_TYPE_VEC2(v1.zw);
+            buf_a[buf_idx     ] = FLOAT_TYPEV2(v0.xy);
+            buf_a[buf_idx + 1 ] = FLOAT_TYPEV2(v0.zw);
+            buf_a[buf_idx + 8 ] = FLOAT_TYPEV2(v1.xy);
+            buf_a[buf_idx + 9 ] = FLOAT_TYPEV2(v1.zw);
 #elif defined(DATA_A_Q5_0)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
@ -92,8 +92,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const uint vui = uint(data_a_packed16[ib].qs[iqs]);
            const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d;

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(v.xz);
-            buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v.yw);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(v.xz);
+            buf_a[buf_idx + 8] = FLOAT_TYPEV2(v.yw);
 #elif defined(DATA_A_Q5_1)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
@ -112,10 +112,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const vec4 v0 = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * dm.x + dm.y;
            const vec4 v1 = vec4(((vui >> 16) & 0xF) | qh2.x, ((vui >> 20) & 0xF) | qh2.y, ((vui >> 24) & 0xF) | qh3.x, ((vui >> 28) & 0xF) | qh3.y) * dm.x + dm.y;

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(v0.xz);
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v1.xz);
-            buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v0.yw);
-            buf_a[buf_idx + 9] = FLOAT_TYPE_VEC2(v1.yw);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(v0.xz);
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2(v1.xz);
+            buf_a[buf_idx + 8] = FLOAT_TYPEV2(v0.yw);
+            buf_a[buf_idx + 9] = FLOAT_TYPEV2(v1.yw);
 #elif defined(DATA_A_Q8_0)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -128,8 +128,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy;
            const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(v.xy);
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(v.xy);
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2(v.zw);
 #elif defined(DATA_A_Q2_K)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -147,8 +147,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin

            const vec4 v = dm.x * float(scales & 0xF) * qs - dm.y * float(scales >> 4);

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(v.xy);
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(v.xy);
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2(v.zw);
 #elif defined(DATA_A_Q3_K)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -171,8 +171,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const vec2 qs = vec2(unpack8((uint(data_a_packed16[ib].qs[qsi / 2]) >> qsshift) & 0x0303).xy);
            const vec2 hm = vec2(unpack8(((uint(data_a_packed16[ib].hmask[hmi / 2]) >> (4 * n + halfsplit)) & 0x0101 ^ 0x0101) << 2).xy);

-            buf_a[buf_idx] = FLOAT_TYPE_VEC2(dl * (qs.x - hm.x),
-                                             dl * (qs.y - hm.y));
+            buf_a[buf_idx] = FLOAT_TYPEV2(dl * (qs.x - hm.x),
+                                          dl * (qs.y - hm.y));
 #elif defined(DATA_A_Q4_K)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -206,8 +206,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin

            const vec4 q = vec4(unpack8((data_a_packed32[ib].qs[qsi / 4] >> (b * 4)) & 0x0F0F0F0F));

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(fma(d, q.x, m), fma(d, q.y, m));
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(fma(d, q.z, m), fma(d, q.w, m));
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(fma(d, q.x, m), fma(d, q.y, m));
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2(fma(d, q.z, m), fma(d, q.w, m));
 #elif defined(DATA_A_Q5_K)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -244,8 +244,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const uint qh = ((data_a_packed32[ib].qh[qhi / 4] >> (iqs / 16)) & 0x01010101) << 4;
            const vec4 q = vec4(unpack8(qs | qh));

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(fma(d, q.x, m), fma(d, q.y, m));
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(fma(d, q.z, m), fma(d, q.w, m));
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(fma(d, q.x, m), fma(d, q.y, m));
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2(fma(d, q.z, m), fma(d, q.w, m));
 #elif defined(DATA_A_Q6_K)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -267,7 +267,7 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303;
            const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale;

-            buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y);
+            buf_a[buf_idx] = FLOAT_TYPEV2(q.x, q.y);
 #elif defined(DATA_A_IQ1_S)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -284,8 +284,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]);

            [[unroll]] for (int k = 0; k < 4; ++k) {
-                buf_a[buf_idx + k] = FLOAT_TYPE_VEC2(dl * (bitfieldExtract(grid, 4 * k    , 2) + delta),
-                                                     dl * (bitfieldExtract(grid, 4 * k + 2, 2) + delta));
+                buf_a[buf_idx + k] = FLOAT_TYPEV2(dl * (bitfieldExtract(grid, 4 * k    , 2) + delta),
+                                                  dl * (bitfieldExtract(grid, 4 * k + 2, 2) + delta));
            }
 #elif defined(DATA_A_IQ1_M)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
@ -306,8 +306,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);

            [[unroll]] for (int k = 0; k < 4; ++k) {
-                buf_a[buf_idx + k] = FLOAT_TYPE_VEC2(dl * (bitfieldExtract(grid, 4 * k    , 2) + delta),
-                                                     dl * (bitfieldExtract(grid, 4 * k + 2, 2) + delta));
+                buf_a[buf_idx + k] = FLOAT_TYPEV2(dl * (bitfieldExtract(grid, 4 * k    , 2) + delta),
+                                                  dl * (bitfieldExtract(grid, 4 * k + 2, 2) + delta));
            }
 #elif defined(DATA_A_IQ2_XXS)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
@ -332,14 +332,14 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const vec4 grid0 = vec4(unpack8(grid.x));
            const vec4 grid1 = vec4(unpack8(grid.y));

-            buf_a[buf_idx    ] = db * FLOAT_TYPE_VEC2((sign &   1) != 0 ? -grid0.x : grid0.x,
-                                                      (sign &   2) != 0 ? -grid0.y : grid0.y);
-            buf_a[buf_idx + 1] = db * FLOAT_TYPE_VEC2((sign &   4) != 0 ? -grid0.z : grid0.z,
-                                                      (sign &   8) != 0 ? -grid0.w : grid0.w);
-            buf_a[buf_idx + 2] = db * FLOAT_TYPE_VEC2((sign &  16) != 0 ? -grid1.x : grid1.x,
-                                                      (sign &  32) != 0 ? -grid1.y : grid1.y);
-            buf_a[buf_idx + 3] = db * FLOAT_TYPE_VEC2((sign &  64) != 0 ? -grid1.z : grid1.z,
-                                                      (sign & 128) != 0 ? -grid1.w : grid1.w);
+            buf_a[buf_idx    ] = db * FLOAT_TYPEV2((sign &   1) != 0 ? -grid0.x : grid0.x,
+                                                   (sign &   2) != 0 ? -grid0.y : grid0.y);
+            buf_a[buf_idx + 1] = db * FLOAT_TYPEV2((sign &   4) != 0 ? -grid0.z : grid0.z,
+                                                   (sign &   8) != 0 ? -grid0.w : grid0.w);
+            buf_a[buf_idx + 2] = db * FLOAT_TYPEV2((sign &  16) != 0 ? -grid1.x : grid1.x,
+                                                   (sign &  32) != 0 ? -grid1.y : grid1.y);
+            buf_a[buf_idx + 3] = db * FLOAT_TYPEV2((sign &  64) != 0 ? -grid1.z : grid1.z,
+                                                   (sign & 128) != 0 ? -grid1.w : grid1.w);
 #elif defined(DATA_A_IQ2_XS)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -358,14 +358,14 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const vec4 grid0 = vec4(unpack8(grid.x));
            const vec4 grid1 = vec4(unpack8(grid.y));

-            buf_a[buf_idx    ] = db * FLOAT_TYPE_VEC2((sign &   1) != 0 ? -grid0.x : grid0.x,
-                                                      (sign &   2) != 0 ? -grid0.y : grid0.y);
-            buf_a[buf_idx + 1] = db * FLOAT_TYPE_VEC2((sign &   4) != 0 ? -grid0.z : grid0.z,
-                                                      (sign &   8) != 0 ? -grid0.w : grid0.w);
-            buf_a[buf_idx + 2] = db * FLOAT_TYPE_VEC2((sign &  16) != 0 ? -grid1.x : grid1.x,
-                                                      (sign &  32) != 0 ? -grid1.y : grid1.y);
-            buf_a[buf_idx + 3] = db * FLOAT_TYPE_VEC2((sign &  64) != 0 ? -grid1.z : grid1.z,
-                                                      (sign & 128) != 0 ? -grid1.w : grid1.w);
+            buf_a[buf_idx    ] = db * FLOAT_TYPEV2((sign &   1) != 0 ? -grid0.x : grid0.x,
+                                                   (sign &   2) != 0 ? -grid0.y : grid0.y);
+            buf_a[buf_idx + 1] = db * FLOAT_TYPEV2((sign &   4) != 0 ? -grid0.z : grid0.z,
+                                                   (sign &   8) != 0 ? -grid0.w : grid0.w);
+            buf_a[buf_idx + 2] = db * FLOAT_TYPEV2((sign &  16) != 0 ? -grid1.x : grid1.x,
+                                                   (sign &  32) != 0 ? -grid1.y : grid1.y);
+            buf_a[buf_idx + 3] = db * FLOAT_TYPEV2((sign &  64) != 0 ? -grid1.z : grid1.z,
+                                                   (sign & 128) != 0 ? -grid1.w : grid1.w);
 #elif defined(DATA_A_IQ2_S)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -386,14 +386,14 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const vec4 grid0 = vec4(unpack8(grid.x));
            const vec4 grid1 = vec4(unpack8(grid.y));

-            buf_a[buf_idx    ] = db * FLOAT_TYPE_VEC2((sign &   1) != 0 ? -grid0.x : grid0.x,
-                                                      (sign &   2) != 0 ? -grid0.y : grid0.y);
-            buf_a[buf_idx + 1] = db * FLOAT_TYPE_VEC2((sign &   4) != 0 ? -grid0.z : grid0.z,
-                                                      (sign &   8) != 0 ? -grid0.w : grid0.w);
-            buf_a[buf_idx + 2] = db * FLOAT_TYPE_VEC2((sign &  16) != 0 ? -grid1.x : grid1.x,
-                                                      (sign &  32) != 0 ? -grid1.y : grid1.y);
-            buf_a[buf_idx + 3] = db * FLOAT_TYPE_VEC2((sign &  64) != 0 ? -grid1.z : grid1.z,
-                                                      (sign & 128) != 0 ? -grid1.w : grid1.w);
+            buf_a[buf_idx    ] = db * FLOAT_TYPEV2((sign &   1) != 0 ? -grid0.x : grid0.x,
+                                                   (sign &   2) != 0 ? -grid0.y : grid0.y);
+            buf_a[buf_idx + 1] = db * FLOAT_TYPEV2((sign &   4) != 0 ? -grid0.z : grid0.z,
+                                                   (sign &   8) != 0 ? -grid0.w : grid0.w);
+            buf_a[buf_idx + 2] = db * FLOAT_TYPEV2((sign &  16) != 0 ? -grid1.x : grid1.x,
+                                                   (sign &  32) != 0 ? -grid1.y : grid1.y);
+            buf_a[buf_idx + 3] = db * FLOAT_TYPEV2((sign &  64) != 0 ? -grid1.z : grid1.z,
+                                                   (sign & 128) != 0 ? -grid1.w : grid1.w);
 #elif defined(DATA_A_IQ3_XXS)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -414,10 +414,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const uint grid = iq3xxs_grid[qs];
            const vec4 v = db * vec4(unpack8(grid));

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2((sign &   1) != 0 ? -v.x : v.x,
-                                                 (sign &   2) != 0 ? -v.y : v.y);
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2((sign &   4) != 0 ? -v.z : v.z,
-                                                 (sign &   8) != 0 ? -v.w : v.w);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2((sign &   1) != 0 ? -v.x : v.x,
+                                              (sign &   2) != 0 ? -v.y : v.y);
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2((sign &   4) != 0 ? -v.z : v.z,
+                                              (sign &   8) != 0 ? -v.w : v.w);
 #elif defined(DATA_A_IQ3_S)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -436,10 +436,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
            const vec4 v = db * vec4(unpack8(grid));

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2((sign &   1) != 0 ? -v.x : v.x,
-                                                 (sign &   2) != 0 ? -v.y : v.y);
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2((sign &   4) != 0 ? -v.z : v.z,
-                                                 (sign &   8) != 0 ? -v.w : v.w);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2((sign &   1) != 0 ? -v.x : v.x,
+                                              (sign &   2) != 0 ? -v.y : v.y);
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2((sign &   4) != 0 ? -v.z : v.z,
+                                              (sign &   8) != 0 ? -v.w : v.w);
 #elif defined(DATA_A_IQ4_XS)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@ -456,8 +456,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const float d = float(data_a[ib].d);
            const vec4 v = d * float(int(sl | (sh << 4)) - 32) * vec4(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y], kvalues_iq4nl[qs.z], kvalues_iq4nl[qs.w]);

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(v.xy);
-            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(v.xy);
+            buf_a[buf_idx + 1] = FLOAT_TYPEV2(v.zw);
 #elif defined(DATA_A_IQ4_NL)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
@ -468,10 +468,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d);
            const uint vui = uint(data_a_packed16[ib].qs[iqs]);

-            buf_a[buf_idx    ] = d * FLOAT_TYPE_VEC2(kvalues_iq4nl[vui & 0xF],
-                                                      kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]);
-            buf_a[buf_idx + 8] = d * FLOAT_TYPE_VEC2(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)],
-                                                     kvalues_iq4nl[vui >> 12]);
+            buf_a[buf_idx    ] = d * FLOAT_TYPEV2(kvalues_iq4nl[vui & 0xF],
+                                                  kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]);
+            buf_a[buf_idx + 8] = d * FLOAT_TYPEV2(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)],
+                                                  kvalues_iq4nl[vui >> 12]);
 #elif defined(DATA_A_MXFP4)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
@ -483,10 +483,10 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const uint vui = uint(data_a[ib].qs[iqs]);
            const uint vui2 = uint(data_a[ib].qs[iqs+1]);

-            buf_a[buf_idx    ] = FLOAT_TYPE_VEC2(kvalues_mxfp4[vui  & 0xF] * d,
-                                                 kvalues_mxfp4[vui2 & 0xF] * d);
-            buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(kvalues_mxfp4[vui  >>  4] * d,
-                                                 kvalues_mxfp4[vui2 >>  4] * d);
+            buf_a[buf_idx    ] = FLOAT_TYPEV2(kvalues_mxfp4[vui  & 0xF] * d,
+                                              kvalues_mxfp4[vui2 & 0xF] * d);
+            buf_a[buf_idx + 8] = FLOAT_TYPEV2(kvalues_mxfp4[vui  >>  4] * d,
+                                              kvalues_mxfp4[vui2 >>  4] * d);
 #endif
 }

@ -496,7 +496,7 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
            // Not supported for b_type bf16 because bf16mat2x4 does not exist
            const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
-            FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]);
+            FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
            buf_b[buf_idx + 0] = bb[0].xy;
            buf_b[buf_idx + 1] = bb[0].zw;
            buf_b[buf_idx + 2] = bb[1].xy;
@ -505,9 +505,9 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
            const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
 #if defined(DATA_B_BF16)
-            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx]));
+            FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
 #else
-            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]);
+            FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
 #endif
            buf_b[buf_idx + 0] = bb.xy;
            buf_b[buf_idx + 1] = bb.zw;
@ -515,12 +515,12 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
            const uint idx = pos_b + col * p.stride_b + row * 2;
            const uint buf_idx = col * SHMEM_STRIDE + row;
            if (idx_n < p.N && block + row * 2 + 1 < end_k) {
-                buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]),
-                                                 TO_FLOAT_TYPE(data_b[idx + 1]));
+                buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]),
+                                              TO_FLOAT_TYPE(data_b[idx + 1]));
            } else if (idx_n < p.N && block + row * 2 < end_k) {
-                buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
+                buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
            } else {
-                buf_b[buf_idx] = FLOAT_TYPE_VEC2(0.0f);
+                buf_b[buf_idx] = FLOAT_TYPEV2(0.0f);
            }
 #endif
 }
@ -531,7 +531,7 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
            const u16vec2 row_idx = row_ids[col];
            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
-            FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]);
+            FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
            buf_b[buf_idx + 0] = bb[0].xy;
            buf_b[buf_idx + 1] = bb[0].zw;
            buf_b[buf_idx + 2] = bb[1].xy;
@ -541,9 +541,9 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
 #if defined(DATA_B_BF16)
-            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx]));
+            FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
 #else
-            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]);
+            FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
 #endif
            buf_b[buf_idx + 0] = bb.xy;
            buf_b[buf_idx + 1] = bb.zw;
@ -553,14 +553,14 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
            if (row_i < _ne1 && block + row * 2 + 1 < end_k) {
                const u16vec2 row_idx = row_ids[col];
                const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
-                buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]),
-                                                 TO_FLOAT_TYPE(data_b[idx + 1]));
+                buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]),
+                                              TO_FLOAT_TYPE(data_b[idx + 1]));
            } else if (row_i < _ne1 && block + row * 2 < end_k) {
                const u16vec2 row_idx = row_ids[col];
                const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
-                buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
+                buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
            } else {
-                buf_b[buf_idx] = FLOAT_TYPE_VEC2(0.0f);
+                buf_b[buf_idx] = FLOAT_TYPEV2(0.0f);
            }
 #endif
 }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl
@ -21,7 +21,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
    buf_a[buf_ib].qs[iqs] = data_a_packed32[ib].qs[iqs];

    if (iqs == 0) {
-        buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
+        buf_a[buf_ib].dm = FLOAT_TYPEV2(data_a_packed32[ib].dm);
    }
 #endif
 }
@ -72,7 +72,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
    buf_a[buf_ib].qs[iqs] = data_a_packed32[ib].qs[iqs];

    if (iqs == 0) {
-        buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib].dm);
+        buf_a[buf_ib].dm = FLOAT_TYPEV2(data_a_packed32[ib].dm);
        buf_a[buf_ib].qh = data_a_packed32[ib].qh;
    }
 #endif
@ -203,7 +203,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
    buf_a[buf_ib].qs[iqs] = vals0 | (vals1 << 2) | (vals2 << 4) | (vals3 << 6);

    if (iqs == 0) {
-        buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm);
+        buf_a[buf_ib].dm = FLOAT_TYPEV2(data_a_packed32[ib_k].dm);
        buf_a[buf_ib].scales = unpack8(uint32_t(data_a_packed16[ib_k].scales[iqs_k / 8])).xy; // vec4 used due to #12147
    }
 }
@ -264,7 +264,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
        const i8vec2 scales = i8vec2(unpack8(uint32_t(((data_a_packed16[ib_k].scales[(is % 8      ) / 2] >> (4 * (is / 8))) & 0x0F0F) |
                                                     (((data_a_packed16[ib_k].scales[(8 + (is % 4)) / 2] >> (2 * (is / 4))) & 0x0303) << 4))).xy); // vec4 used due to #12147

-        buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales - 32));
+        buf_a[buf_ib].d_scales = FLOAT_TYPEV2(float(data_a_packed16[ib_k].d) * vec2(scales - 32));
    }
 }

@ -334,7 +334,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
                              (data_a[ib_k].scales[is+4] >>  4) | ((data_a[ib_k].scales[is  ] & 0xC0) >> 2));
        }

-        buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(vec2(data_a_packed32[ib_k].dm) * vec2(scale_dm));
+        buf_a[buf_ib].dm = FLOAT_TYPEV2(vec2(data_a_packed32[ib_k].dm) * vec2(scale_dm));
    }
 }

@ -385,7 +385,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
        const uint is = iqs_k / 4;
        const i8vec2 scales = unpack8(int32_t(data_a_packed16[ib_k].scales[is / 2])).xy;

-        buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales));
+        buf_a[buf_ib].d_scales = FLOAT_TYPEV2(float(data_a_packed16[ib_k].d) * vec2(scales));
    }
 }

@ -426,7 +426,7 @@ void block_b_to_shmem(const uint buf_ib, const uint ib, const uint iqs, const bo
        const uint ib_inner = ib % 4;

        if (iqs == 0) {
-            buf_b[buf_ib].ds = FLOAT_TYPE_VEC2(data_b[ib_outer].ds[ib_inner]);
+            buf_b[buf_ib].ds = FLOAT_TYPEV2(data_b[ib_outer].ds[ib_inner]);
        }

        const ivec4 values = data_b[ib_outer].qs[ib_inner * 2 + iqs];
@ -436,7 +436,7 @@ void block_b_to_shmem(const uint buf_ib, const uint ib, const uint iqs, const bo
        buf_b[buf_ib].qs[iqs * 4 + 3] = values.w;
    } else {
        if (iqs == 0) {
-            buf_b[buf_ib].ds = FLOAT_TYPE_VEC2(0.0f);
+            buf_b[buf_ib].ds = FLOAT_TYPEV2(0.0f);
        }

        buf_b[buf_ib].qs[iqs * 4    ] = 0;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
@ -8,7 +8,7 @@ struct block_a_cache {
 #define QUANT_R_MMQ 2
 struct block_a_cache {
    uint32_t qs[16/4];
-    FLOAT_TYPE_VEC2 dm;
+    FLOAT_TYPEV2 dm;
 };
 #elif defined(DATA_A_Q5_0)
 #define QUANT_R_MMQ 2
@ -22,7 +22,7 @@ struct block_a_cache {
 struct block_a_cache {
    uint32_t qs[16/4];
    uint32_t qh;
-    FLOAT_TYPE_VEC2 dm;
+    FLOAT_TYPEV2 dm;
 };
 #elif defined(DATA_A_Q8_0)
 #define QUANT_R_MMQ 1
@ -43,36 +43,36 @@ struct block_a_cache {
 struct block_a_cache {
    uint32_t qs[2];
    u8vec2 scales;
-    FLOAT_TYPE_VEC2 dm;
+    FLOAT_TYPEV2 dm;
 };
 #elif defined(DATA_A_Q3_K)
 #define QUANT_R_MMQ 2
 struct block_a_cache {
    uint32_t qs[4];
-    FLOAT_TYPE_VEC2 d_scales;
+    FLOAT_TYPEV2 d_scales;
 };
 #elif defined(DATA_A_Q4_K)
 #define QUANT_R_MMQ 2
 struct block_a_cache {
    uint32_t qs[4];
-    FLOAT_TYPE_VEC2 dm;
+    FLOAT_TYPEV2 dm;
 };
 #elif defined(DATA_A_Q5_K)
 #define QUANT_R_MMQ 1
 struct block_a_cache {
    int32_t qs[8];
-    FLOAT_TYPE_VEC2 dm;
+    FLOAT_TYPEV2 dm;
 };
 #elif defined(DATA_A_Q6_K)
 #define QUANT_R_MMQ 1
 struct block_a_cache {
    int32_t qs[8];
-    FLOAT_TYPE_VEC2 d_scales;
+    FLOAT_TYPEV2 d_scales;
 };
 #endif

 struct block_b_cache
 {
    int32_t qs[8];
-    FLOAT_TYPE_VEC2 ds;
+    FLOAT_TYPEV2 ds;
 };
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@ -445,8 +445,8 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
        base_dict["FLOAT16"] = "1";
    }

-    base_dict["ACC_TYPE"     ] = f16acc ? "float16_t" : "float";
-    base_dict["ACC_TYPE_VEC2"] = f16acc ? "f16vec2"   : "vec2";
+    base_dict["ACC_TYPE"  ] = f16acc ? "float16_t" : "float";
+    base_dict["ACC_TYPEV2"] = f16acc ? "f16vec2"   : "vec2";
    if (f16acc) {
        base_dict["ACC_TYPE_MAX"] = "float16_t(65504.0)";
    }
@ -513,10 +513,10 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
    };

    const std::map<std::string, std::string> float_type_dict_f16 = {
-        {"FLOAT_TYPE",      FLOAT_TYPE(1, "f16")},
-        {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "f16")},
-        {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "f16")},
-        {"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, "f16")},
+        {"FLOAT_TYPE",   FLOAT_TYPE(1, "f16")},
+        {"FLOAT_TYPEV2", FLOAT_TYPE(2, "f16")},
+        {"FLOAT_TYPEV4", FLOAT_TYPE(4, "f16")},
+        {"FLOAT_TYPEV8", FLOAT_TYPE(8, "f16")},
    };

    // Shaders with f16 B_TYPE
@ -535,9 +535,9 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
        std::string to_float_type = (coopmat || coopmat2) ? "uintBitsToBFloat16EXT" : "bf16_to_fp32";

        const std::map<std::string, std::string> float_type_dict_bf16 = {
-            {"FLOAT_TYPE",      FLOAT_TYPE(1, "bf16")},
-            {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "bf16")},
-            {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "bf16")},
+            {"FLOAT_TYPE",   FLOAT_TYPE(1, "bf16")},
+            {"FLOAT_TYPEV2", FLOAT_TYPE(2, "bf16")},
+            {"FLOAT_TYPEV4", FLOAT_TYPE(4, "bf16")},
        };

        // If bfloat16 is not supported, then only compile the scalar (promote to fp32) shader
@ -568,10 +568,10 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? load_vec : load_vec_quant;

        const std::map<std::string, std::string> float_type_dict = {
-            {"FLOAT_TYPE",      FLOAT_TYPE(1, tname)},
-            {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, tname)},
-            {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, tname)},
-            {"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, tname)},
+            {"FLOAT_TYPE",   FLOAT_TYPE(1, tname)},
+            {"FLOAT_TYPEV2", FLOAT_TYPE(2, tname)},
+            {"FLOAT_TYPEV4", FLOAT_TYPE(4, tname)},
+            {"FLOAT_TYPEV8", FLOAT_TYPE(8, tname)},
        };

        // don't generate f32 variants for coopmat2
@ -675,36 +675,36 @@ void process_shaders() {
        }
    }

-    std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}};
+    std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}};

    for (const auto& tname : type_names) {
        // mul mat vec
        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";

-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV2", "f16vec2"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}));

-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV2", "f16vec2"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));

-        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
-        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV2", "f16vec2"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));

-        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
-        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32_f32_subgroup_no_shmem", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));

        // mul mat vec with integer dot product
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (is_legacy_quant(tname) || tname == "mxfp4" || is_k_quant(tname) || tname == "iq1_s" || tname == "iq1_m") {
-            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
-            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}, {"ACC_TYPE", "float"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));

-            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
-            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
-            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
+            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}, {"ACC_TYPE", "float"}}));
+            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
+            string_to_spv("mul_mat_vec_id_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
        }
 #endif

@ -725,9 +725,9 @@ void process_shaders() {

    string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}});

-    string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
-    string_to_spv("mul_mat_vec_p021_f16_f32",              "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
-    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPEV4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
+    string_to_spv("mul_mat_vec_p021_f16_f32",              "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPEV4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPEV4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}});

    // Norms
    string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));